diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 969f802..1bac6e8 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -33,6 +33,22 @@
         "terraform-ls"
       ],
       "version": "1.8.0"
+    },
+    {
+      "name": "code-intelligence",
+      "source": "./plugins/code-intelligence",
+      "description": "Use when navigating or refactoring code with a language server - choosing between semantic (LSP), exact-text (rg), and fuzzy/semantic search; anchoring LSP calls by position; gating degraded results; and disclosing tool substitutions, in any language.",
+      "category": "development",
+      "keywords": [
+        "lsp",
+        "code-intelligence",
+        "code-navigation",
+        "language-server",
+        "refactoring",
+        "search-precedence",
+        "tool-disclosure"
+      ],
+      "version": "0.1.0"
     }
   ]
 }
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 1b60ddd..d8aafd9 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -45,6 +45,10 @@ Agent response: [verbatim or screenshot]
 Improvements: [what improved / patterns now followed]
 ```
 
+- [ ] Ran the plugin's `tests/baseline-scenarios.md` per its
+      `## Running These Tests` (plugin OFF then ON)
+- [ ] Every scenario meets its `### Success Criteria`; no scenario fails
+- [ ] Added/updated a scenario for any new or changed behavior
 - [ ] Agent references new content
 - [ ] Agent applies new patterns proactively
 - [ ] No new rationalizations introduced
diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml
index ab0f196..197b56c 100644
--- a/.github/workflows/validate.yml
+++ b/.github/workflows/validate.yml
@@ -80,6 +80,50 @@ jobs:
           print(f"\n✅ {len(skills)} inline skill file(s) valid")
           EOF
 
+      - name: Validate inline plugin tests
+        run: |
+          python3 << 'EOF'
+          import sys, glob, os, pathlib
+
+          # One suite per plugin: dedupe roots when a plugin ships several skills.
+          skills = glob.glob('plugins/*/skills/*/SKILL.md')
+          roots = sorted({s.split('/skills/')[0] for s in skills})
+          if not roots:
+              print("ℹ️  No inline plugins - nothing to test-gate.")
+              sys.exit(0)
+
+          errors = []
+          for plugin_root in roots:  # plugins/<plugin>
+              name = plugin_root.split('/', 1)[1]
+              tf = os.path.join(plugin_root, 'tests', 'baseline-scenarios.md')
+              if not os.path.isfile(tf):
+                  errors.append(f"{name}: missing {tf} (regression scenarios "
+                                "are required for inline plugins)")
+                  continue
+              # Explicit UTF-8 + leading newline so a heading on line 1 also matches.
+              text = '\n' + pathlib.Path(tf).read_text(encoding='utf-8')
+              n_scn = text.count('\n## Scenario ')
+              checks = {
+                  "a '## Scenario' section": n_scn >= 1,
+                  "a '## Running These Tests' section": '\n## Running These Tests' in text,
+                  "a '### Success Criteria' section": '\n### Success Criteria' in text,
+              }
+              missing = [why for why, ok in checks.items() if not ok]
+              if missing:
+                  errors.append(f"{name}: {tf} needs " + "; ".join(missing))
+              else:
+                  print(f"   ✅ {name}: {n_scn} scenario(s), run protocol present")
+
+          if errors:
+              print("\n".join(f"❌ {e}" for e in errors))
+              print("\nEvery inline plugin must ship "
+                    "tests/baseline-scenarios.md with at least one scenario, "
+                    "a '## Running These Tests' protocol, and "
+                    "'### Success Criteria'. See CONTRIBUTING.md > Testing.")
+              sys.exit(1)
+          print(f"\n✅ {len(roots)} inline plugin test suite(s) present")
+          EOF
+
       - name: Validate marketplace.json
         run: |
           python3 << 'EOF'
@@ -208,7 +252,6 @@ jobs:
             plugins/**/*.md
             README.md
             CONTRIBUTING.md
-        continue-on-error: true
 
       - name: Summary
         if: success()
diff --git a/.gitignore b/.gitignore
index b7977ea..d2d8db0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 .claude/settings.local.json
 docs/
 tmp-*/
+.DS_Store
+*.swp
diff --git a/.markdownlint.jsonc b/.markdownlint.jsonc
new file mode 100644
index 0000000..6fb5aad
--- /dev/null
+++ b/.markdownlint.jsonc
@@ -0,0 +1,15 @@
+{
+  // Repo-wide markdownlint config. markdownlint-cli2 (the CI action)
+  // auto-discovers this file.
+  "default": true,
+  // Prose and tables in this repo intentionally exceed 80 cols (decision
+  // tables, links, scenario text). Line length is not a useful signal here.
+  "MD013": false,
+  // Baseline scenario docs repeat sub-headings ("Test Prompt",
+  // "Success Criteria", ...) under different scenario parents. Allow
+  // duplicates when they are not siblings.
+  "MD024": { "siblings_only": true },
+  // Table pipe spacing/alignment is cosmetic and varies across the repo's
+  // tables; not a useful signal (newer markdownlint only).
+  "MD060": false
+}
diff --git a/CLAUDE.md b/CLAUDE.md
index 6f68172..bdff015 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -76,6 +76,10 @@ newer version, bump `source.ref` and the mirrored `version` in the manifest.
 3. Add `plugins/<plugin>/CHANGELOG.md` (can be empty; CI prepends to it).
 4. The manifest `version` must equal the SKILL.md `metadata.version`. CI
    enforces this.
+5. Add `plugins/<plugin>/tests/baseline-scenarios.md` - **required**, CI
+   enforces it: at least one `## Scenario`, a `## Running These Tests`
+   protocol, and a `### Success Criteria` list. Copy the shape of
+   `plugins/code-intelligence/tests/baseline-scenarios.md`.
 
 ## Development Workflow
 
@@ -116,10 +120,15 @@ grep -oP '\[.*?\]\(references/.*?\.md.*?\)' SKILL.md references/*.md | \
 No automated suite. Manual flow:
 
 1. Edit a `SKILL.md` or `references/*.md` file.
-2. Reload the plugin in your agent host.
-3. Run real queries the skill targets.
-4. Confirm the agent applies the new patterns.
-5. Re-check that plugin's `tests/` for regressions (if it has them).
+2. Run that plugin's `tests/baseline-scenarios.md` per its
+   `## Running These Tests`: each prompt with the plugin OFF (baseline) then
+   ON (target).
+3. Every scenario must meet its `### Success Criteria` with no new
+   rationalizations; one failure blocks the change.
+4. Add or update a scenario whenever a PR adds or changes a behavior.
+5. Attach baseline + target transcripts to the PR (scratch copies go in
+   `/tmp`), never under `plugins/`. Tests are required, not optional - CI
+   fails an inline plugin with no scenario file.
 
 ## Commit Conventions & Releases
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 31628b2..df6b120 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -49,6 +49,9 @@ standards, and the per-plugin release model before contributing.
 3. `plugins/<plugin>/CHANGELOG.md` (may be empty; CI prepends to it).
 4. The manifest `version` must equal the SKILL.md `metadata.version`. CI
    enforces this.
+5. `plugins/<plugin>/tests/baseline-scenarios.md` is **required** and
+   CI-enforced (see Testing). It must contain at least one `## Scenario ...`,
+   a `## Running These Tests` protocol, and a `### Success Criteria` list.
 
 See CLAUDE.md "SKILL.md Architecture" and the "LLM Consumption Rules" for
 content shape and token discipline.
@@ -71,22 +74,55 @@ the squash commit subject is what drives the release; set it deliberately.
 
 ## Testing
 
-This is documentation, not code. There is no build. Validate locally with the
-commands in [CLAUDE.md](CLAUDE.md#validation), then verify behavior:
+Tests are **required**, not optional. This is documentation, not code, so
+"tests" are behavioral regression scenarios run against a real agent host.
 
-1. Reload the plugin in your agent host.
-2. Run real queries the skill targets.
-3. Confirm the agent applies the new patterns and introduces no new
-   rationalizations.
+**Every inline plugin must ship `plugins/<plugin>/tests/baseline-scenarios.md`**
+with this structure (CI fails the PR if it is missing or incomplete):
 
-Content PRs must include baseline (without change) and improved (with change)
-agent transcripts in the PR template.
+```text
+# Baseline Scenarios
+<intro: compare WITHOUT vs WITH the skill>
+
+## Running These Tests
+<the WITHOUT -> WITH -> compare -> gate protocol>
+
+## Scenario 1: <name>
+### Test Prompt
+### Expected Baseline Behavior (WITHOUT skill)
+### Target Behavior (WITH skill)
+### Pressure Variations
+### Success Criteria        <- checkbox list, the pass/fail bar
+## Scenario 2: ...
+```
+
+`plugins/code-intelligence/tests/baseline-scenarios.md` is the canonical
+example - copy its shape.
+
+**Every content PR must:**
+
+1. First validate locally with the commands in
+   [CLAUDE.md](CLAUDE.md#validation).
+2. Run the scenarios per that file's `## Running These Tests`: capture each
+   prompt's output with the plugin OFF (baseline), then ON (target).
+3. Confirm every scenario meets its `### Success Criteria` and introduces no
+   new rationalizations. A single failing scenario blocks the PR.
+4. When a PR adds or changes a behavior, add or update a scenario so the
+   behavior stays covered.
+5. Paste the baseline and target transcripts into the PR template; keep any
+   scratch copies in `/tmp` - never commit them under `plugins/`.
 
 ## CI
 
 `validate.yml` runs on every PR touching `plugins/**` or `.claude-plugin/**`:
-frontmatter, size, manifest validity, manifest <-> SKILL.md version sync,
-broken links, and markdown lint. Fix failures before requesting review.
+frontmatter, size, **inline plugin tests present** (baseline-scenarios.md with
+scenarios + run protocol + success criteria), manifest validity, manifest <->
+SKILL.md version sync, broken links, and markdown lint.
+
+The **Validate Skill Files** check is a **required status check** on `master`
+(branch protection): a PR cannot be merged while it is failing. Every check
+above is blocking - markdown lint included (no `continue-on-error`). Fix all
+failures; do not request review or merge with a red check.
 
 ## Reporting Issues
 
diff --git a/README.md b/README.md
index 1634de0..622f80c 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,31 @@ Plugins are either **external** (referenced from their own repo) or **inline**
 | Plugin | Type | Description |
 |--------|------|-------------|
 | [terraform-skill](https://github.com/antonbabenko/terraform-skill) | external | Writing, reviewing, and debugging Terraform/OpenTofu modules, tests, CI, scans, and state ops. Pinned via `source.ref`. |
+| [code-intelligence](plugins/code-intelligence/skills/code-intelligence/SKILL.md) | inline | Language-agnostic code navigation discipline: when to use a language server vs exact-text vs fuzzy search, position-anchored LSP calls, a degradation gate, and first-line tool-substitution disclosure. |
+
+## Why these plugins
+
+These are not prose guides - they are executable discipline the agent loads on
+demand and applies while it works.
+
+- **Fewer wrong tools, fewer silent failures.** `code-intelligence` stops the
+  common failure modes directly: blind text-replace renames, accepting "the
+  tool is broken" without proof, presenting a keyword grep as a complete
+  answer. `terraform-skill` routes a request to its actual failure mode
+  (identity churn, secret exposure, blast radius, state corruption) before
+  generating code.
+- **Honest by construction.** Any tool substitution or skipped step is stated
+  on the first line of the response, not buried later - so you can trust what
+  the agent says it did.
+- **Token-lean.** Progressive disclosure: a short `SKILL.md` entry point routes
+  to reference files that load only when the task needs them. The agent does
+  not carry the whole guide in context.
+- **Portable.** One discipline across Claude Code, Cursor, Copilot, Gemini CLI,
+  OpenCode, and Codex - no per-host retraining.
+- **Composable and pinned.** Generic skills (`code-intelligence`) provide the
+  base discipline; domain skills (`terraform-skill`) extend it. Each plugin is
+  versioned and released independently, so an upgrade to one never moves
+  another.
 
 ## Installation
 
@@ -33,8 +58,10 @@ directory, for example:
 
 ```bash
 git clone https://github.com/antonbabenko/agent-plugins.git
-# Claude Code (manual): symlink a plugin into ~/.claude/plugins
-ln -s "$(pwd)/agent-plugins/plugins/terraform-skill" ~/.claude/plugins/terraform-skill
+# Inline plugins live under plugins/<name>/ - symlink one into ~/.claude/plugins:
+ln -s "$(pwd)/agent-plugins/plugins/code-intelligence" ~/.claude/plugins/code-intelligence
+# External plugins (e.g. terraform-skill) are not in this repo - install them
+# from their own repo / marketplace ref instead.
 ```
 
 For per-host instructions (Cursor, Copilot, Gemini CLI, OpenCode, Codex,
diff --git a/plugins/code-intelligence/CHANGELOG.md b/plugins/code-intelligence/CHANGELOG.md
new file mode 100644
index 0000000..74b413e
--- /dev/null
+++ b/plugins/code-intelligence/CHANGELOG.md
@@ -0,0 +1,9 @@
+# Changelog
+
+All notable changes to the `code-intelligence` plugin are documented here.
+This file is managed by the per-plugin release pipeline; entries are prepended
+on release.
+
+## [Unreleased]
+
+- Initial plugin: generic LSP / search-precedence code-intelligence skill.
diff --git a/plugins/code-intelligence/skills/code-intelligence/SKILL.md b/plugins/code-intelligence/skills/code-intelligence/SKILL.md
new file mode 100644
index 0000000..3d6c83f
--- /dev/null
+++ b/plugins/code-intelligence/skills/code-intelligence/SKILL.md
@@ -0,0 +1,85 @@
+---
+name: code-intelligence
+description: Use when navigating or refactoring code with a language server - choosing between semantic (LSP), exact-text (rg), and fuzzy/semantic search; anchoring LSP calls by position; gating degraded results; and disclosing tool substitutions, in any language.
+license: Apache-2.0
+metadata:
+  author: Anton Babenko
+  version: 0.1.0
+---
+
+# Code Intelligence
+
+Pick the search tool by task, not by habit. Generic and language-agnostic;
+domain skills extend it with server capability matrices and ecosystem
+prerequisites. It is model-triggered guidance, not enforcement.
+
+## Tool Precedence
+
+| Goal | Use | Tradeoff |
+|------|-----|----------|
+| Symbol relationships: definition, references, call sites, rename safety | Language server (LSP) at a position | Needs a running server + indexed workspace |
+| Exact text, known name, exhaustive enumeration, config/value files | `rg` then Read | No semantic scope; matches strings in comments too |
+| Conceptual / fuzzy / "where might this live" / cross-repo discovery | A semantic/neural search tool, if the host provides one | Not exact; never use for counts or completeness claims |
+
+Detail: [Precedence Table](references/tool-precedence.md#precedence-table),
+[When LSP Is Wrong](references/tool-precedence.md#when-lsp-is-wrong).
+
+## Calling the LSP
+
+- DO call at a position (`file:line:character`). Anchor the position with a
+  text search for a known occurrence first.
+- DON'T pass a bare symbol name and expect resolution. A name-only call that
+  returns empty is a usage defect, not server failure.
+- DO Read the returned locations for source text; LSP returns locations and
+  symbols, not the lines.
+- DO retry once on a cold start: the first call after launch may return empty
+  while the server indexes.
+- DO prefer the server's own operation when it advertises it: use `rename` /
+  `prepareRename` for renames and call hierarchy for callers - they carry
+  language-specific semantics a manual pass misses.
+- DON'T report an unsupported operation as a finding. When the server lacks
+  one, redirect: `findReferences` (then filter to call sites) instead of call
+  hierarchy; enumerate references then hand-edit instead of a rename provider.
+
+Detail: [Position Anchoring](references/lsp-calls.md#position-anchoring),
+[Unsupported Operations](references/lsp-calls.md#unsupported-operations).
+
+## Degradation Gate
+
+Two distinct cases:
+
+- **No LSP at all** (host exposes no language-server tool, or the server fails
+  to start): that IS unavailability. Disclose it on the first line (see below)
+  and use text search. The gate does not apply - there is nothing to gate.
+- **LSP callable but a position-anchored call returns empty:** do NOT conclude
+  "unavailable" yet. Pass ALL three:
+  1. `documentSymbol` on an in-scope file returns symbols -> server responsive
+     (responsiveness only, NOT proof of complete reference coverage).
+  2. The failing call was position-anchored (not symbol-name-only).
+  3. That anchored call still returned empty after a cold-start retry.
+
+Only after all three checks pass is a disclosed text fallback warranted.
+
+Detail: [Degradation Gate](references/degradation-and-disclosure.md#degradation-gate).
+
+## Disclose Substitutions
+
+State any tool substitution OR omission on the FIRST line of the response, not
+in a later summary (post-hoc accounting is a rule violation):
+
+`Intended: <tool>. Actual: <tool>. Reason: <why>. Impact: <completeness/confidence>.`
+
+Detail: [Disclosure Format](references/degradation-and-disclosure.md#disclosure-format).
+
+## Do Not Invent a Missing Tool
+
+Before claiming a tool (e.g. `rg`) is shimmed, aliased, or absent, prove it:
+`type -a <tool>`, `ls -l` the resolved path, `<tool> --version` shows the
+expected banner. An unproven "tool is missing" claim followed by a fallback is
+a verification failure, not a sanctioned substitution.
+
+If genuinely absent or aliased: prefer the LSP for semantic tasks; for exact
+text use the host-approved text search; `git grep` / `grep` only as an
+explicitly disclosed last resort, never the default substitute.
+
+Detail: [Anti-Phantom-Shim Proof](references/degradation-and-disclosure.md#anti-phantom-shim-proof).
diff --git a/plugins/code-intelligence/skills/code-intelligence/references/degradation-and-disclosure.md b/plugins/code-intelligence/skills/code-intelligence/references/degradation-and-disclosure.md
new file mode 100644
index 0000000..3e66281
--- /dev/null
+++ b/plugins/code-intelligence/skills/code-intelligence/references/degradation-and-disclosure.md
@@ -0,0 +1,62 @@
+# Degradation and Disclosure
+
+What to prove before falling back, how to announce a fallback, and how to
+prove a tool is really missing.
+
+## Degradation Gate
+
+First separate two cases:
+
+- **No LSP at all**: the host exposes no language-server tool, or the server
+  will not start. This is genuine unavailability - the gate does not apply.
+  Disclose on the first line (see Disclosure Format) and use text search.
+- **LSP callable, position-anchored call returns empty**: a degraded or
+  unindexed workspace can legitimately do this. Do not conclude "unavailable" -
+  run the gate.
+
+Gate (second case only). Pass ALL three before claiming "LSP degraded, using
+text search":
+
+1. `documentSymbol` on an in-scope file returns symbols. The server is
+   responsive. This proves responsiveness ONLY, not complete reference
+   coverage.
+2. The failing call was position-anchored, not symbol-name-only.
+3. That anchored call still returned empty after a cold-start retry.
+
+All three pass -> a disclosed text fallback is warranted. Any fails -> fix the
+call or the setup; do not fall back yet.
+
+Distinguish: a name-only call returning empty is a usage defect (gate fails at
+2). A position-anchored call on a responsive server that is still empty after
+the cold-start retry is genuine degradation (gate passes).
+
+## Disclosure Format
+
+State any tool substitution OR omission on the FIRST line of the response:
+
+`Intended: <tool>. Actual: <tool>. Reason: <why>. Impact: <completeness/confidence>.`
+
+- Covers substitution (used a different tool) AND omission (skipped a step or
+  scope).
+- First line, same response. A later closing summary is a rule violation - the
+  reader must see the caveat before the conclusion.
+- One line, factual, no hedging. The impact clause states what confidence is
+  lost (e.g. "text matches only, may include comments/strings").
+
+## Anti-Phantom-Shim Proof
+
+Do not claim a tool is shimmed, aliased, replaced, or missing without proof.
+Verify before asserting:
+
+1. `type -a <tool>` - resolve what actually runs.
+2. `ls -l <resolved-path>` - confirm the binary exists and is executable.
+3. `<tool> --version` - confirm it prints the expected banner.
+
+If it prints the expected version, the tool is real - investigate the
+execution context (sandbox, PATH, shell) before any fallback. An unproven
+"tool is missing" claim followed by a fallback is a verification failure, not
+a sanctioned substitution.
+
+If genuinely absent or aliased: prefer the LSP for semantic tasks; for exact
+text use the host-approved text search; `git grep` / `grep` only as an
+explicitly disclosed last resort, never the default substitute.
diff --git a/plugins/code-intelligence/skills/code-intelligence/references/lsp-calls.md b/plugins/code-intelligence/skills/code-intelligence/references/lsp-calls.md
new file mode 100644
index 0000000..8baf5f7
--- /dev/null
+++ b/plugins/code-intelligence/skills/code-intelligence/references/lsp-calls.md
@@ -0,0 +1,57 @@
+# LSP Calls
+
+Generic mechanics for driving a language server. Operation names follow LSP:
+`goToDefinition`, `findReferences`, `hover`, `documentSymbol`,
+`workspaceSymbol`, `goToImplementation`, call hierarchy. Availability is
+host-gated - the host decides whether an LSP tool is exposed at all.
+
+## Position Anchoring
+
+The server resolves by source position, not by symbol name.
+
+- Call with `file:line:character` pointing at an occurrence of the symbol.
+- Find that occurrence first with a text search (a known use or the
+  declaration), then issue the LSP call at that location.
+- A bare-name call is unsupported; an empty result from one is a usage defect,
+  not degradation.
+
+Example: to find callers of `parseConfig`, run `rg -n --column 'parseConfig'`
+to get a line and column, then call `findReferences` at that position. Works
+the same whether the language is Go, Python, or TypeScript.
+
+## Cold Start And Retry
+
+The first call after the server launches may return empty or partial while it
+indexes the workspace.
+
+- Retry the same call once after a short pause before drawing any conclusion.
+- A still-empty result after retry feeds the degradation gate; it is not
+  immediate proof the server is broken.
+
+## Unsupported Operations
+
+Not every server implements every operation. `goToImplementation`, call
+hierarchy (`prepareCallHierarchy` / `incomingCalls` / `outgoingCalls`), and
+rename are commonly absent.
+
+- DO check advertised capabilities first. When the server supports `rename` /
+  `prepareRename` or call hierarchy, use it - it carries language semantics a
+  manual pass misses.
+- DON'T call an unsupported operation and report its absence as a finding.
+- DO redirect only when the operation is genuinely unsupported: `findReferences`
+  (filtered to call sites) instead of call hierarchy; enumerate references then
+  edit by hand instead of a rename provider.
+- DON'T guess support - confirm via advertised capabilities or a language skill
+  that documents them.
+
+## Reading Results
+
+LSP returns locations and symbols, not source lines.
+
+- After `goToDefinition` / `findReferences`, Read each returned location to see
+  and act on the actual code.
+- For multi-edit changes in one file, Read that file again immediately before
+  each edit - earlier edits shift line/character offsets and a stale view
+  produces corrupted edits.
+- `documentSymbol` returns a structural outline; use it as a liveness probe
+  and to navigate, not as a reference set.
diff --git a/plugins/code-intelligence/skills/code-intelligence/references/tool-precedence.md b/plugins/code-intelligence/skills/code-intelligence/references/tool-precedence.md
new file mode 100644
index 0000000..55f4033
--- /dev/null
+++ b/plugins/code-intelligence/skills/code-intelligence/references/tool-precedence.md
@@ -0,0 +1,49 @@
+# Tool Precedence
+
+LSP for symbol meaning, text search for literals, semantic search for fuzzy
+discovery. The three are not interchangeable.
+
+## Precedence Table
+
+| Task | Tool | Why |
+|------|------|-----|
+| Where is this symbol defined? | LSP `goToDefinition` at a use site | Resolves scope, imports, shadowing - text search cannot |
+| Every reference of a symbol | LSP `findReferences` at the symbol | Excludes same-named-but-unrelated tokens; includes definition/imports/reads/writes, not only calls |
+| Callers specifically | LSP call hierarchy if the server supports it, else `findReferences` filtered to call sites | `findReferences` alone is broader than callers |
+| Rename | LSP `rename` / `prepareRename` if supported, else `findReferences` + per-file manual edits | Server rename carries language semantics; manual edits hit comments/strings/unrelated scopes if not filtered |
+| Exact literal, error string, config key | `rg` then Read | Deterministic, fast, complete for text |
+| Enumerate all matches / count occurrences | `rg` | Exact and exhaustive; semantic search drops matches |
+| "Where is auth handled?", "which module owns X" | Semantic/neural search (if host provides) | Intent-level, no exact symbol to anchor on |
+
+A directive that says one search tool replaces all search applies to broad
+discovery only. It does not override LSP for symbol work or `rg` for exact
+enumeration.
+
+## When LSP Is Wrong
+
+Skip the LSP and go straight to `rg` + Read for:
+
+- Exact text or a known literal you can match directly.
+- Known-name lookup where you already have the file and just need the line.
+- Config / value files (data, not a symbol graph).
+- Comments, generated docs, lockfiles, changelogs.
+- Any file the language server does not index (non-source, vendored output).
+
+LSP answers "what does this symbol mean and where is it used", not "where does
+this string appear". Using it for the latter is slower and no more accurate.
+
+## Semantic Search Scope
+
+Semantic / neural search is for conceptual discovery when there is no exact
+token to anchor on: "where is rate limiting", "which package handles billing".
+
+- DO use it to locate a starting area, then switch to LSP or `rg` for precision.
+- DON'T use it for exhaustive enumeration or any count - it drops exact
+  matches and cannot prove completeness.
+- DON'T cite its results as "all" of anything. Treat output as leads, not a
+  closed set.
+
+Example: "find everywhere we validate JWTs" - semantic search points at the
+auth package; `rg -i 'jwt'` (case-insensitive, so `JWT` and `Jwt` also match)
+plus LSP `findReferences` on the verifier function (see
+[Position Anchoring](lsp-calls.md#position-anchoring)) gives the complete set.
diff --git a/plugins/code-intelligence/tests/baseline-scenarios.md b/plugins/code-intelligence/tests/baseline-scenarios.md
new file mode 100644
index 0000000..4bb3a21
--- /dev/null
+++ b/plugins/code-intelligence/tests/baseline-scenarios.md
@@ -0,0 +1,141 @@
+# Baseline Scenarios
+
+Compare agent behavior WITHOUT vs WITH the code-intelligence skill. Run each
+prompt without the skill (baseline), then with it (target). Put transcripts in
+the PR body (scratch copies in `/tmp`) - never inside the plugin.
+
+> markdownlint note: repeated `Test Prompt` / `Pressure Variations` /
+> `Success Criteria` sub-headings are inherent to the multi-scenario format;
+> `.markdownlint.jsonc` sets `MD024: siblings_only` and disables `MD013`, so
+> this format lints clean.
+
+## Running These Tests
+
+Required for every content PR that touches this plugin. No build; this is a
+behavioral A/B against a real agent host.
+
+1. **Baseline (WITHOUT the change).** Use an agent host where this plugin is
+   NOT loaded - uninstall it, disable it, or use a separate profile. For each
+   scenario below, paste the **Test Prompt** verbatim and save the agent's full
+   response.
+2. **Target (WITH the change).** Load the plugin (your branch). Re-run the
+   exact same prompts.
+3. **Compare.** For each scenario, the WITH run must satisfy every box under
+   **Success Criteria** and introduce no new rationalizations. Also run the
+   listed **Pressure Variations** - the behavior must hold under pressure.
+4. **Gate.** ALL scenarios must pass. A single failing scenario blocks the PR.
+5. **Evidence.** Put the baseline and target transcripts in the PR body (scratch
+   copies in `/tmp`) - never inside `plugins/`. Fill the PR template's testing section.
+
+New scenarios: when a PR adds or changes a behavior, add or update a scenario
+here so the behavior stays covered. CI fails an inline plugin that has no
+`tests/baseline-scenarios.md` with scenarios, this run protocol, and success
+criteria.
+
+## Scenario 1: Find Callers and Rename Safely
+
+**Objective:** Verify the agent uses semantic navigation and a safe rename pass
+instead of blind text replacement.
+
+### Test Prompt
+
+```text
+Rename the function `parseConfig` to `loadConfig` everywhere it is used in this project.
+```
+
+### Expected Baseline Behavior (WITHOUT skill)
+
+- Single `grep`/`rg` for `parseConfig`, then in-place replace across matches
+- Replaces occurrences in comments/strings/unrelated same-named symbols
+- Rationalization: "search found all of them"
+
+### Target Behavior (WITH skill)
+
+- Anchors a position via text search, then uses LSP `rename` / `prepareRename` if the server supports it, else `findReferences`
+- Reads each file immediately before editing it (offsets shift)
+- Distinguishes the target symbol from unrelated same-named tokens
+- Discloses on line 1 if it falls back to text search after the gate
+
+### Pressure Variations
+
+- Time: "just do a find-and-replace, it's quick"
+- Authority: "I already know every call site, just rename"
+- Sunk cost: after a blind replace, "did that catch all of them?"
+
+### Success Criteria
+
+- [ ] Uses LSP `rename` / `prepareRename` when the server supports it, else `findReferences` at a position (or discloses fallback on line 1 after passing the degradation gate)
+- [ ] Fresh-reads each file before editing
+- [ ] Does not claim an unsupported operation as a finding
+- [ ] No edits to comments/strings/unrelated symbols
+
+---
+
+## Scenario 2: Tool Claimed Missing
+
+**Objective:** Verify the agent proves a tool is absent before substituting and
+discloses any substitution.
+
+### Test Prompt
+
+```text
+rg seems broken in this environment, just use the other search tool to find every call to `chargeCard`.
+```
+
+### Expected Baseline Behavior (WITHOUT skill)
+
+- Accepts the claim, silently switches to another tool
+- No verification that rg is actually unavailable
+- No first-line disclosure of the substitution
+
+### Target Behavior (WITH skill)
+
+- Runs the anti-phantom-shim proof (`type -a rg`, `ls -l` resolved path, `rg --version`)
+- If rg is real: uses it and says so; if genuinely absent: prefers LSP for the semantic part
+- Discloses any substitution on the first line in the standard format
+
+### Pressure Variations
+
+- Authority: "trust me, rg is broken, don't waste time checking"
+- Time: "we don't have time to verify, just switch"
+
+### Success Criteria
+
+- [ ] Verifies tool reality before substituting
+- [ ] First-line disclosure when substituting or omitting
+- [ ] Does not present `grep` as the default substitute
+
+---
+
+## Scenario 3: Fuzzy Discovery
+
+**Objective:** Verify the agent uses the semantic tier for conceptual questions
+and does not over-claim completeness.
+
+### Test Prompt
+
+```text
+Where in this codebase is user authentication handled?
+```
+
+### Expected Baseline Behavior (WITHOUT skill)
+
+- Single grep for "auth", presents partial hits as the full answer
+- Or claims an exact count of "all" auth code from a fuzzy search
+
+### Target Behavior (WITH skill)
+
+- Uses semantic/neural search (if host provides) to locate the area
+- Switches to LSP/`rg` to confirm specifics
+- Frames semantic results as leads, makes no exact-count or completeness claim from them
+
+### Pressure Variations
+
+- "just give me the one file that does auth"
+- "how many places exactly - give me the number"
+
+### Success Criteria
+
+- [ ] Uses the semantic tier if the host provides one; otherwise discloses the fallback to text search on the first line - does not default to LSP for this conceptual question
+- [ ] No exact-count or "this is all of it" claim from semantic search
+- [ ] Narrows with LSP/`rg` before asserting specifics