diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..149f416
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,86 @@
+# Copy to .env (gitignored). The runner reads this from the repo root, not CWD,
+# so you configure it once here and invoke `review.py` from any project
+# directory. You only need to set the key for whichever provider you actually
+# use; the runner fails fast with a clear message if the key for the selected
+# provider is missing.
+
+# --- Provider selection (default: openrouter) -------------------------------
+# Per-run override: `review.py --provider gemini`
+# CODE_REVIEW_PROVIDER=openrouter
+
+# --- OpenRouter --------------------------------------------------------------
+# Required when --provider openrouter (the default). https://openrouter.ai/keys
+OPENROUTER_API_KEY=
+
+# Optional: default model when --provider openrouter is selected. OpenRouter
+# prefixes vendor names. `google/gemini-2.5-flash` is ~3x faster than pro.
+# OPENROUTER_MODEL=google/gemini-2.5-pro
+
+# Optional: identifying headers OpenRouter surfaces in its dashboard.
+# OPENROUTER_HTTP_REFERER=https://github.com/Airwhale/local-gemini-code-review
+# OPENROUTER_X_TITLE=OpenRouter Code Review
+
+# --- Gemini API (Google AI Studio) -------------------------------------------
+# Required when --provider gemini. https://aistudio.google.com/apikey
+GEMINI_API_KEY=
+
+# Optional: default model when --provider gemini is selected. The Gemini API
+# takes the bare model name (no `google/` prefix).
+# GEMINI_MODEL=gemini-2.5-pro
+
+# --- Ollama (local LLM) ------------------------------------------------------
+# No API key needed -- Ollama runs locally. Install from https://ollama.com
+# (or inside WSL on Windows if Smart App Control / Application Control blocks
+# the native installer). Start the server with `ollama serve`, then pull at
+# least one model, e.g. `ollama pull qwen3-coder:30b`.
+#
+# Per-run override: `review.py --provider ollama`
+
+# Optional: Ollama server URL. Default is http://localhost:11434 which works
+# for both native Windows installs and WSL2 with default mirrored networking.
+# Override if Ollama listens on a non-default port, runs on another machine,
+# or you're using a WSL distro without localhost mirroring (use the WSL2 IP
+# from `wsl hostname -I` in that case).
+# OLLAMA_HOST=http://localhost:11434
+
+# Optional: default model when --provider ollama is selected. Recommended:
+# `qwen3-coder:30b` (30B MoE coder, ~3.3B active params -- the quality/speed
+# sweet spot on CPU since active params drive inference speed, not total).
+# Higher quality: `qwen3-coder-next` (80B/3B active MoE, ~40 GB download).
+# Use named aliases via `--model`: `local` (qwen3-coder:30b),
+# `local-pro` (qwen3-coder-next). For very weak hardware that can't load
+# 30B, pass an explicit small model slug like `--model qwen2.5-coder:7b`
+# rather than expecting an alias -- the qwen2.5 family is a generation
+# behind on code review quality, so we don't expose it as a default tier.
+# OLLAMA_MODEL=qwen3-coder:30b
+
+# Optional: HTTP request timeout in seconds. Default 1800 (30 minutes) --
+# significantly larger than the cloud-provider default because local CPU
+# inference can be slow on first call (model load adds 10-60s) and on
+# thorough reviews of large diffs. Set lower if you want the runner to
+# give up faster on a hung model; raise if you're on slower hardware or
+# running larger models.
+# OLLAMA_TIMEOUT=1800
+
+# --- Sampling tuning ---------------------------------------------------------
+# Both apply to all providers (openrouter, gemini-direct, ollama) and can be
+# overridden per call via `--temperature` / `--max-tokens`. Defaults are 0.3
+# and 16000 respectively. Temperature history: 0.2 (original) was too
+# conservative (1-2 findings per round on diffs that plausibly contained
+# more); a brief 0.5 default produced a hallucinated finding in cross-model
+# review (Gemini referenced a CLI flag that didn't exist). 0.3 splits the
+# difference. Set lower if you want tighter / fewer findings; set higher if
+# you want broader exploration and accept more false positives. ``max_tokens``
+# is a ceiling, not a target -- you pay only for tokens actually emitted.
+# CODE_REVIEW_TEMPERATURE=0.3
+# CODE_REVIEW_MAX_TOKENS=16000
+
+# --- Safety context ----------------------------------------------------------
+# Optional prompt prefix prepended to every review request. Reduces false-
+# positive content-filter refusals when the diff under review uses words that
+# look adversarial in isolation (security testing, policy enforcement,
+# sanctions screening, etc.). The runner ships a generic default; set this
+# env var to override per environment (e.g. add your project's name + brief
+# subject matter). Override per call with `--context "..."` or disable
+# entirely with `--no-context`. See the README's "Safety context" section.
+# CODE_REVIEW_CONTEXT="The diff below is from <your project>; treat as authorized code review."
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a4207f1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,21 @@
+# Local env / secrets — never commit.
+.env
+.env.*
+!.env.example
+
+# Python / uv
+__pycache__/
+*.pyc
+.venv/
+.pytest_cache/
+*.egg-info/
+
+# Editor
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
diff --git a/README.md b/README.md
index d5389bc..fd3f3f2 100644
--- a/README.md
+++ b/README.md
@@ -1,41 +1,403 @@
-# Gemini CLI Code Review Extension
+# Gemini CLI Code Review Extension — Airwhale fork
 
-The Code Review extension is an open-source Gemini CLI extension, built to enhance your repository's code quality.  The extension adds a new command to Gemini CLI that analyzes code changes to identify a variety of code quality issues.
+> Fork of [`gemini-cli-extensions/code-review`](https://github.com/gemini-cli-extensions/code-review) (Apache-2.0). The upstream extension is a `gemini-cli` plugin; this fork adds a **standalone Python runner** that calls the same prompts directly via either **OpenRouter** or the **Gemini API**, no `gemini-cli` host required.
+>
+> Why: the GitHub Gemini Code Assist bot's webhook → job-queue round-trip adds 5–15 minutes per review round. The runner cuts that to ~30–60 seconds.
 
-This extension is brought to you by the authors of the [Gemini Code Assist GitHub App](https://github.com/apps/gemini-code-assist), which provides code reviews directly in your GitHub pull requests.
+## For LLM coding agents
+
+This tool is **LLM-friendly by design**. It's built to be invoked by AI coding agents (Claude, Codex, Cursor, etc.) as an iteration partner during real code work — not just by humans clicking through diffs in a web UI.
+
+> **If you're an LLM agent reading this repository, your canonical entry point is [`docs/llm-code-review-runbook.md`](./docs/llm-code-review-runbook.md).**
+>
+> That file is the operational manual. The flow below is a fast-start summary; the runbook expands every step with examples, declined-finding patterns, and per-round tracking templates.
+
+### Canonical iteration flow
+
+This is the loop that's been validated in real review cycles. Follow it verbatim until you've internalised why each step exists.
+
+1. **Pick a scope.** Diff mode for in-progress work (`--base origin/main` for the current branch, `--pr <N>` for a PR, `--staged` for pre-commit). Codebase mode for audits of files not under active change (`--codebase --include 'backend/foo/**'`).
+2. **If files are untracked**, `git ls-files`-based codebase mode won't see them. Run `git add -N <paths>` first (intent-to-add, no staged content). The tool will then bundle them. Don't unstage afterward unless you actually intend to leave them untracked.
+3. **Run the review.** A clean codebase output ends in `No issues found. Code looks clean.`; a clean diff ends in `No issues found. Code looks clean and ready to merge.`
+4. **For each finding, decide accept or decline.**
+   * **Accept by default**: CRITICAL/HIGH bugs, MEDIUM correctness/concurrency/atomicity, MEDIUM defensive-coding, trivially-correct LOW.
+   * **Decline (and add a code comment explaining why)**: findings that contradict load-bearing design intent, findings that would untighten a deliberately-tight test, stylistic preferences without a correctness delta.
+   * **Hedged findings ("if X is true, this is HIGH/CRITICAL") are a decline signal until you verify X.** The reviewer hedges when it lacks evidence; in practice these resolve as false positives more often than as real bugs. Treat the hedge as a flag to spot-check the premise, not as an instruction to act.
+5. **Apply accepted fixes inline. Do NOT commit between rounds.** `--base <ref>` uses a two-dot diff, so working-tree edits show up in the next run.
+6. **For each declined finding, add a code comment** immediately adjacent to the flagged line explaining the rejection. Without it, the next round re-flags the same finding. This is the central operational rule that makes the loop converge.
+7. **Re-run.** Repeat steps 3–6.
+8. **Stop conditions** (in priority order):
+   * Output is clean ("No issues found...") → done.
+   * A round produces only hallucinated findings (wrong line numbers, contradictory suggestions, or claims that don't match the code) → done; the model has run out of substantive material.
+   * You've hit 4 rounds → done; remaining findings are usually noise or out of scope.
+9. **Run tests + build.** Commit. Push.
+
+### What "hallucination" looks like in practice
+
+The model occasionally fabricates a finding it then partially refutes — e.g., *"This call passes the wrong type at L772. A better fix would be to pass the list of vectors directly"* — when the existing code at the actual (non-L772) line already does exactly the "better fix." When you see a finding's own suggested fix match the existing code, decline without action.
+
+Pre-fix to v1.1, line numbers in codebase mode drifted 5–150 lines depending on file size. As of [b124501](https://github.com/Airwhale/local-gemini-code-review/commit/b124501) the bundle pre-numbers every line (`cat -n` style) and the model transcribes the prefix instead of counting; codebase-mode line numbers are now exact. If you see drift again, that's a regression — file an issue.
+
+### Per-round ledger
+
+Keep one row per finding so the final commit message or PR comment can summarise the cycle:
+
+```
+| Round | File | Line | Severity | Finding                          | Action              |
+|-------|------|------|----------|----------------------------------|---------------------|
+| 1     | a.py | 808  | LOW      | Custom markdown parser fragile   | Declined w/ comment |
+| 1     | b.py | 111  | MEDIUM   | Missing type hint on `result`    | Applied             |
+| 2     | a.py | 313  | MEDIUM   | Silent LLMClient fallback        | Applied             |
+| 3     | -    | -    | -        | Vector sum duplication           | Declined w/ comment |
+| 4     | -    | -    | -        | Hallucination (line mismatch)    | No action           |
+```
+
+This is the artifact a human reviewer reads to understand what changed and why. The runbook's "Per-round tracking" section has the long form.
+
+The rest of this README is the general-audience documentation (humans, contributors, anyone evaluating the fork). The runbook is the agent-targeted documentation, and it is the file to read first if your job is to use the tool, not understand its provenance.
+
+## What this fork adds vs upstream
 
-## Installation
+| File | New / modified | Purpose |
+|---|---|---|
+| `review.py` | **new** | Standalone runner. Loads upstream skill + command prompts for diff review, adds fork-specific skill + command for whole-codebase review, POSTs to OpenRouter, the Gemini API, or a local Ollama server. Includes per-provider alias tables so `--model claude` expands to the full OpenRouter slug and `--model local` resolves to a recommended Ollama model. |
+| `skills/code-review-codebase/SKILL.md` | **new** | Fork-specific whole-codebase review skill. Same persona / severity rubric as the upstream `code-review-commons` skill, but Critical Constraints adapted to permit comments on any line of any file in the bundle (the upstream "comment only on `+`/`-` lines" rule forbids commenting on whole-file content). |
+| `commands/codebase-review.toml` | **new** | Fork-specific command for `--codebase` mode. Defines the bundle delimiter and per-file-findings output shape. |
+| `pyproject.toml` | **new** | `uv`-managed deps (`httpx`, `python-dotenv`). |
+| `.env.example` | **new** | Documents `OPENROUTER_API_KEY`, `GEMINI_API_KEY`, optional model / provider overrides, and the Ollama-local config (`OLLAMA_HOST`, `OLLAMA_MODEL`, `OLLAMA_TIMEOUT`). |
+| `.gitignore` | **new** | Protects `.env` and the `uv` virtualenv from leaking. |
+| `docs/llm-code-review-runbook.md` | **new** | Operational runbook for using the tool as an LLM iteration partner. |
+| `README.md` | **modified** | This file — documents the fork's runner alongside the upstream CLI mode. |
+| `skills/code-review-commons/SKILL.md` | unchanged | Upstream system prompt (loaded verbatim for diff review). |
+| `commands/code-review.toml` | unchanged | Upstream diff-review user-prompt template (loaded verbatim). |
+| `commands/pr-code-review.toml` | unchanged | Upstream PR-review command (not used by the runner). |
+| `gemini-extension.json`, `GEMINI.md`, `LICENSE` | unchanged | Upstream metadata. |
 
-Install the Code Review extension by running the following command from your terminal *(requires Gemini CLI v0.4.0 or newer)*:
+The fork keeps the upstream skill / command files **byte-for-byte identical** so upstream improvements rebase cleanly:
 
 ```bash
-gemini extensions install https://github.com/gemini-cli-extensions/code-review
+git fetch upstream
+git checkout main && git merge upstream/main   # picks up upstream prompt changes
+```
+
+## Quick start
+
+```bash
+# One-time: clone and configure
+git clone https://github.com/Airwhale/local-gemini-code-review
+cd local-gemini-code-review
+cp .env.example .env
+# edit .env: set OPENROUTER_API_KEY=... or GEMINI_API_KEY=... (or both),
+# or skip both keys entirely if you only want the local Ollama path.
+
+# From the runner directory, against the CWD:
+cd /path/to/local-gemini-code-review
+uv run review.py --base origin/main         # iterative review of current branch
+uv run review.py --pr 6                     # review a specific PR
+uv run review.py --codebase                 # whole tracked codebase (filtered)
+uv run review.py --base origin/main --model claude   # use Claude instead of Gemini
+uv run review.py --base origin/main --provider ollama  # local, no API key
+
+# Or invoke from any project directory by pointing at the runner:
+cd /path/to/my-project
+uv run --project /path/to/local-gemini-code-review /path/to/local-gemini-code-review/review.py
+```
+
+`uv` resolves deps on first run. No global `pip install` required.
+
+## Provider selection
+
+The runner supports three transport paths — two cloud, one local:
+
+```bash
+uv run review.py                        # default: openrouter
+uv run review.py --provider openrouter  # explicit
+uv run review.py --provider gemini      # direct Gemini API
+uv run review.py --provider ollama      # local Ollama server (no API key)
+```
+
+| Provider | Endpoint | Required env var | Default model | Notes |
+|---|---|---|---|---|
+| `openrouter` (default) | `openrouter.ai/api/v1/chat/completions` | `OPENROUTER_API_KEY` | `google/gemini-2.5-pro` | OpenAI-compatible chat-completions wire format. One bill for many providers. |
+| `gemini` | `generativelanguage.googleapis.com/v1beta/models/...` | `GEMINI_API_KEY` | `gemini-2.5-pro` | Google AI Studio's `generateContent` endpoint. Slightly lower latency (one less hop). |
+| `ollama` | `{OLLAMA_HOST}/v1/chat/completions` (default `http://localhost:11434`) | none — local server | `qwen3-coder:30b` | Local LLM via [Ollama](https://ollama.com). OpenAI-compatible endpoint. No API key, no token costs, code never leaves the machine. CPU inference is slower than cloud (1–5 min/review typical). |
+
+**Known gotcha for the Gemini API path:** the free tier has zero per-day quota for `gemini-2.5-pro` as of the time of writing. You'll see HTTP 429 immediately. Workarounds:
+
+1. `--model gemini-2.5-flash` — the free tier does allow flash, and it's ~3× faster anyway. Quality drops a bit but the review structure is still solid.
+2. Upgrade to a paid Google AI Studio plan if you want pro via the direct API.
+3. Use `--provider openrouter` — OpenRouter has its own pro quota and bills you directly.
+
+Set a default per-environment via `$CODE_REVIEW_PROVIDER=gemini` in your `.env` so you don't have to pass `--provider` every invocation.
+
+### When to use the Ollama (local) provider
+
+Empirically, local and cloud have **different failure modes**, not strictly better/worse:
+
+- **Cloud** (`openrouter`, `gemini`): faster, more structured output, but hallucinations can slip past at the default temperature. Observed during integration testing: `google/gemini-2.5-pro` produced a confidently-worded HIGH-severity finding that referenced a CLI flag and "help text" that did not exist in the codebase. The suggested fix would have crashed the runner. Verify each finding against actual code before accepting.
+- **Local** (`ollama`): slower (CPU-bound on consumer hardware), sparser output, but won't typically invent findings that contradict the diff. On the same diff that produced the cloud hallucination above, the local model returned a clean "no issues" — possibly missing real nits, but not making up new ones.
+
+Practical workflow: cloud first for the structured triage pass; local as a sanity check or for offline / sensitive / cost-free work. The integration testing run that prompted this section is documented in the runbook.
+
+### Setting up the Ollama provider
+
+1. Install [Ollama](https://ollama.com/download). Runs as a service on `http://localhost:11434`. On Windows, if Smart App Control / Application Control blocks the native installer, install in WSL2 instead — the runner reaches the WSL server via localhost mirroring without any extra config.
+2. Pull at least one model. The default expected by the runner is **`qwen3-coder:30b`** (30B Mixture-of-Experts coder with ~3.3B active parameters — the quality/speed sweet spot on CPU because active-params drive inference speed, not total-params):
+
+    ```bash
+    ollama pull qwen3-coder:30b
+    ```
+
+   Alternative: `qwen3-coder-next` (80B/3B MoE, higher quality, ~40 GB download).
+
+3. (Optional) Override defaults via `.env`:
+   - `OLLAMA_HOST=http://localhost:11434` — server URL. Override if Ollama is on a non-default port, another machine, or a WSL distro without localhost mirroring.
+   - `OLLAMA_MODEL=qwen3-coder:30b` — default model when `--provider ollama` is selected.
+   - `OLLAMA_TIMEOUT=1800` — HTTP timeout in seconds. Default 30 minutes accommodates CPU cold-starts (10–60 s model load) plus thorough reviews; lower it if you'd rather fail fast.
+4. Run:
+
+    ```bash
+    uv run review.py --provider ollama --base origin/main
+    ```
+
+The runner's error messages are tailored for Ollama-specific failure modes — "server unreachable" suggests `ollama serve` and `--ollama-host`; "model not pulled" gives you the exact `ollama pull <model>` command to run.
+
+### Model aliases
+
+The `--model` flag accepts any provider-native slug, plus a curated set of short aliases scoped per provider. An alias is only valid for its declared provider; using one with the wrong `--provider` raises a typed `CONFIG` error pointing at the correct one rather than silently sending an invalid model name upstream.
+
+**OpenRouter aliases** (use with `--provider openrouter`, the default):
+
+| Alias | Resolves to | Notes |
+|---|---|---|
+| `pro` / `gemini-pro` | `google/gemini-2.5-pro` | Current default. |
+| `flash` / `gemini-flash` | `google/gemini-2.5-flash` | ~3× faster than pro, some quality loss. |
+| `claude` / `claude-sonnet` | `anthropic/claude-sonnet-4.5` | Great as a second-opinion reviewer alongside a Gemini round. |
+| `claude-opus` | `anthropic/claude-opus-4.5` | Larger model; slower and pricier. |
+| `gpt` | `openai/gpt-5` | Useful for an independent third opinion. |
+| `gpt-mini` | `openai/gpt-5-mini` | Cheaper / faster GPT. |
+| `deepseek` | `deepseek/deepseek-chat-v3.1` | Cheap, surprisingly strong on code review. |
+
+**Ollama aliases** (use with `--provider ollama`):
+
+| Alias | Resolves to | Notes |
+|---|---|---|
+| `local` | `qwen3-coder:30b` | Current Ollama default. 30B MoE with ~3.3B active params — the recommended quality/speed balance on CPU. |
+| `local-pro` | `qwen3-coder-next` | 80B/3B MoE. Higher quality at the cost of ~40 GB download + slightly slower active path. |
+
+**The `gemini` (direct-API) provider has no aliases** — it takes bare Gemini model names only (e.g. `gemini-2.5-pro`, `gemini-2.5-flash`). Passing an OpenRouter alias like `claude` with `--provider gemini` errors out clearly.
+
+Raw provider-native slugs still pass through unchanged, so anything OpenRouter serves or anything you have pulled into Ollama — including newer models that haven't earned an alias yet — works via `--model <slug>`.
+
+## Modes
+
+| Flag | What it does |
+|---|---|
+| *(none)* | Diff: current branch vs `origin/HEAD` merge-base — matches the upstream `gemini-cli` `/code-review` shape. |
+| `--base main` | Diff: current branch vs an explicit base ref (**includes uncommitted changes**, so iterative-review loops work without committing each pass). |
+| `--pr <N>` | Diff: pulls a GitHub PR diff via `gh pr diff` (requires `gh auth login`). |
+| `--staged` | Diff: staged changes only — good for pre-commit reviews. |
+| `--codebase` | Whole codebase: bundles tracked files (via `git ls-files`) and reviews them all. Narrow with `--include` / `--exclude` glob flags. |
+
+`--model <slug>` overrides the default model. Use `google/gemini-2.5-flash` / `flash` (OpenRouter) or `gemini-2.5-flash` (Gemini API) for ~3× faster reviews with some quality loss.
+
+Two more runtime knobs:
+
+- `--temperature <float>` (default `0.3`, env `CODE_REVIEW_TEMPERATURE`): sampling randomness. Higher = more findings per call, more hallucinations. Lower = tighter, fewer findings.
+- `--max-tokens <int>` (default `16000`, env `CODE_REVIEW_MAX_TOKENS`): ceiling on output. Not a target — you pay only for what's emitted. Default avoids mid-finding truncation on thorough reviews.
+
+The temperature default has been retuned twice. **0.2** (original) was too conservative: 1–2 findings per round on diffs that plausibly contained more, 5–7 rounds to converge. **0.5** surfaced more findings per round (3–5 typical) but produced a clear hallucination during cross-model integration testing — `google/gemini-2.5-pro` returned a confidently-worded HIGH-severity finding that referenced a CLI flag and "help text" that did not exist in the code; the suggested fix would have crashed the runner. **0.3** (current) is the compromise: tight enough to cut hallucination, loose enough to keep "more findings than 0.2." Override per call with `--temperature` if your project benefits from a different setting; empirical re-tuning is encouraged.
+
+### Whole-codebase mode (`--codebase`)
+
+For "audit this repo I just inherited" or "find bugs in code none of us touched in this PR," the diff modes don't help. `--codebase` bundles every tracked file (filtered) into a single payload and reviews them as a whole.
+
+```bash
+uv run review.py --codebase                              # everything tracked, minus built-in noise filters
+uv run review.py --codebase --include 'backend/**/*.py'  # narrow to a directory + extension
+uv run review.py --codebase --exclude '**/test_*'        # widen then narrow
+uv run review.py --codebase --model claude               # use Claude as the codebase reviewer
+```
+
+File selection pipeline:
+
+1. `git ls-files` → all tracked files (so `.gitignore` already filters `node_modules`, `.venv`, build artifacts).
+2. Apply user `--include` globs if any.
+3. Apply user `--exclude` globs.
+4. Apply built-in defensive excludes: lock files, minified output, common binary extensions (`*.png`, `*.svg`, `*.woff`, etc.), `*/dist/*`, `*/build/*`.
+5. Drop individual files larger than 100 KB (logged on stderr — they're usually data fixtures or vendored blobs).
+
+A bundle cap (700,000 chars, ~175 K tokens at the standard 4-chars-per-token estimate) is enforced pre-flight; if the bundle is too large the runner exits with the 10 largest files in the current selection so you can target `--exclude` flags effectively rather than paying for a request that would fail mid-flight on the smaller-context models. The cap is conservative against both Gemini 2.5 Pro (1 M-token context) and Claude Sonnet 4.5 (200 K-token context), so the same selection works regardless of which model you target with `--model`.
+
+Output is the same severity-tagged per-file findings shape as diff mode (see the **Output format** section below — the diff-mode template applies, with the per-file section anchoring to the bundle's `======== FILE: <path> ========` delimiters instead of a diff). The **architectural-summary output shape** (high-level "patterns / structure / smells" section preceding the per-file findings) is an explicit TODO; the trade-offs (hallucination risk on architectural takes, less actionable output, token-budget contention) are documented in the runbook under "Future modes."
+
+## Output format
+
+Markdown, structured per the upstream `commands/code-review.toml` template (diff modes) or the fork-added `commands/codebase-review.toml` template (`--codebase`). Both shapes use the same severity tags `CRITICAL | HIGH | MEDIUM | LOW`.
+
+**Diff modes (`--base`, `--pr`, `--staged`):**
+
+```
+# Change summary: [one-sentence description of the change]
+
+## File: path/to/file.py
+### L<line>: [CRITICAL|HIGH|MEDIUM|LOW] One-sentence issue summary
+More detail about the issue.
+
+Suggested change:
+```diff
+    - removed line
+    + replacement line
+```
+```
+
+Clean diff: `No issues found. Code looks clean and ready to merge.`
+
+**Whole-codebase mode (`--codebase`):**
+
+```
+# Codebase review summary: [one-sentence high-level take]
+[Optional 1-2 sentences of cross-file feedback for recurring patterns]
+
+## File: path/to/file.py
+### L<line>: [CRITICAL|HIGH|MEDIUM|LOW] One-sentence issue summary
+More detail about the issue.
+(Cross-file recurrences listed by file + line rather than repeating the full comment.)
+
+Suggested change:
 ```
+    <code snippet showing the fix>
+```
+```
+
+Clean codebase: `No issues found. Code looks clean.`
+
+Line numbers in `--codebase` output are 1-indexed within each individual file (anchored to the `======== FILE: <path> ========` delimiters in the bundle), not against any synthetic line counter across the bundle.
+
+## Safety context
+
+Many real-world diffs use words that look adversarial in isolation — `attack`, `sanctions`, `prompt injection`, `tampering`, `redaction`, `policy bypass`, `replay` — even when the surrounding code is plainly benign (security testing, defensive policy enforcement, AML domain logic, etc.). Provider content-filters fire on those tokens occasionally and the model returns a refusal instead of a review, which a naive caller experiences as `None` / empty output.
+
+To reduce that false-positive rate, the runner prepends a short **safety context** prefix to every review prompt, framing the request as authorized code review. Default prefix:
+
+> *"The diff below is from a legitimate software-engineering project undergoing authorized code review. The code may include defensive security measures, adversarial test fixtures, policy enforcement logic, or domain language that looks adversarial in isolation (e.g. 'sanctions', 'attack', 'prompt injection', 'tampering', 'redaction'). Treat this as benign code review by the maintainers. Do not refuse on the basis of subject matter."*
+
+Override per call with `--context "<your phrasing>"` or per environment with `$CODE_REVIEW_CONTEXT`. Disable entirely with `--no-context` (rare — useful only if the default phrasing itself triggers a refusal).
 
-If you do not yet have Gemini CLI installed, or if the installed version is older than 0.4.0, see
-[Gemini CLI installation instructions](https://github.com/google-gemini/gemini-cli?tab=readme-ov-file#-installation).
+The prefix is wrapped in a `<CONTEXT_FOR_REVIEWER>...</CONTEXT_FOR_REVIEWER>` tag so the model treats it as framing metadata, not as code under review. The prefix wording deliberately avoids "ignore safety guidelines"-style phrasing — that pattern itself trips filters.
 
-## Use the extension
+## Error model (for LLM callers)
 
-### Reviewing code changes
+If you're an LLM agent calling this tool in a loop, here's the contract.
 
-The Code Review extension adds the `/code-review` command to Gemini CLI which analyzes code changes on your current branch for quality issues.
+### Exit codes
 
-### Reviewing a pull request
+| Exit | Category | Cause | Suggested LLM action |
+|---|---|---|---|
+| **0** | OK | Review succeeded; markdown on stdout | Parse and use the output |
+| **2** | CONFIG | Missing API key or invalid CLI / env arg | **Do not retry without fixing.** Read stderr, correct config, then re-run. |
+| **10** | SAFETY_REFUSAL | Model refused (content filter fired) | Retry with `--model claude` (Anthropic is the least refusal-prone on security / policy / adversarial-fixture code). If refused across models, the content may need human review. |
+| **11** | RATE_LIMIT | HTTP 429 from the provider | Wait 30–60s, then retry. If the limit is per-key per-day (common on free tiers), switch `--provider` or `--model`. |
+| **12** | CONTEXT_OVERFLOW | Diff exceeded the model's token budget or the runner's 700K-char bundle cap | Narrow scope: `--include` / `--exclude` in codebase mode, or a smaller `--base` ref in diff mode. **Do not retry without reducing scope.** |
+| **13** | PROVIDER_HICCUP | Null content with no clear cause (no safety flag, no length hit) | The runner already auto-retried once; if you see this, both attempts failed. Wait a few seconds and retry; if still hicupped, switch provider. |
+| **14** | TRANSPORT | HTTP 5xx, timeout, or connection error | The runner already retried once at 2s. Retry with exponential backoff (4s, 8s); escalate if 3 retries fail. |
+| **1** | UNKNOWN | Catchall (unexpected exception, non-JSON response, etc.) | Read stderr for the surviving exception; escalate if unclear. |
 
-The Code Review extension adds the `/pr-code-review` command to Gemini CLI which analyzes code changes on your pull request for quality issues.
+### Stderr format
+
+Stable single-line prefix for machine parsing:
+
+```
+ERROR: <CATEGORY> [exit <N>]
+```
+
+Followed by free-form lines for human readability:
+
+```
+ERROR: SAFETY_REFUSAL [exit 10]
+Reason: Model refused with finish_reason='safety'
+Model: google/gemini-2.5-pro
+Provider: openrouter
+Suggested: Retry with a different model: ``--model claude`` is the most refusal-resistant...
+Detail: {"choices":[{"message":{"content":null,...}}],...}
+```
+
+An agent can `grep -oE '^ERROR: [A-Z_]+'` the stderr to extract the category cheaply.
+
+### Auto-retry behavior
+
+The runner auto-retries **once** on:
+
+- `PROVIDER_HICCUP` — usually recovers on the second call
+- `TRANSPORT` — HTTP 5xx / network timeout
+
+Other categories surface immediately:
+
+- `SAFETY_REFUSAL` — a second call with the same model + prompt almost always reproduces the refusal; better to switch model than burn tokens
+- `RATE_LIMIT` — retry without waiting just makes it worse
+- `CONTEXT_OVERFLOW` — the scope is wrong, not the call
+- `CONFIG` — fix the config
+
+### Decision tree for LLM callers
+
+```
+review.py exited:
+├── 0   → review succeeded; markdown on stdout
+├── 2   → check config; do NOT retry without changes
+├── 10  → retry with --model claude; if still 10, escalate to human
+├── 11  → wait 60s and retry; if still 11, switch --provider
+├── 12  → narrow scope (--include/--exclude or smaller --base); do NOT retry without scope change
+├── 13  → runner already retried once; retry yourself once after a few seconds; if still 13, switch provider
+├── 14  → exponential backoff retry (4s, 8s, 16s); escalate after 3 attempts
+└── 1   → read stderr; escalate
+```
+
+## Operational runbook
+
+The full workflow — iteration loop, accept/decline heuristics, the decline-comment contract, known gotchas, per-round tracking — lives in [`docs/llm-code-review-runbook.md`](./docs/llm-code-review-runbook.md). Same content works for human developers and LLM agents alike.
+
+## Why this exists
+
+The GitHub Gemini Code Assist bot is excellent at finding real concurrency, security, and correctness bugs — but it lives behind a GitHub webhook that calls a Google job queue, and the wall-time latency makes iterative cycles painful when you want a review every few commits. Running the same prompts locally via OpenRouter or the Gemini API cuts the loop from minutes to seconds and works offline from GitHub.
+
+In practice this turns the workflow from "push → wait → fix → repeat" into "stage → review locally → fix → stage → review locally → commit when clean," with the GitHub bot reserved as a final-mile verification pass instead of an iteration partner. The Apache-2.0 license on the upstream extension permits this kind of derivative work.
+
+A 10-iteration test against a ~5K-line PR caught 3 HIGH-severity correctness bugs (infinite session-creation loop, header-overwrite latent bug, Map-collision UX bug) plus several MEDIUM-severity defensive-coding issues — all in ~10 minutes wall-time. The same review through the GitHub bot would have taken ~50 minutes for the equivalent rounds.
+
+## Upstream sync
+
+When upstream ships changes to the skill or command prompts:
+
+```bash
+git fetch upstream
+git checkout main && git merge upstream/main
+```
+
+Because the runner doesn't touch `skills/` or `commands/`, conflicts are unlikely.
+
+---
+
+## Original upstream README
+
+The Code Review extension is an open-source Gemini CLI extension, built to enhance your repository's code quality. The extension adds a new command to Gemini CLI that analyzes code changes to identify a variety of code quality issues.
+
+This extension is brought to you by the authors of the [Gemini Code Assist GitHub App](https://github.com/apps/gemini-code-assist), which provides code reviews directly in your GitHub pull requests.
+
+### Installation (upstream CLI mode)
+
+If you want to install the upstream extension into `gemini-cli` (separate from the standalone runner this fork ships), point at the **upstream repository**, not this fork:
+
+```bash
+gemini extensions install https://github.com/gemini-cli-extensions/code-review
+```
 
-To use this extension for a pull request, you need to [enable](https://github.com/google-gemini/gemini-cli/blob/main/docs/tools/mcp-server.md) the [github mcp server](https://github.com/github/github-mcp-server), and provide pull request information. You can either provide through `/pr-code-review link/to/pull/request` or by [configuring](https://github.com/google-gemini/gemini-cli/blob/main/docs/reference/configuration.md#environment-variables-and-env-files) the following environment variables:
-- `REPOSITORY`: The github repository which contains the pull request.
-- `PULL_REQUEST_NUMBER`: The pull request number that need the code review.
-- `ADDITIONAL_CONTEXT`: Additional context or specific area that should focus on.
+If you do not yet have Gemini CLI installed, or if the installed version is older than 0.4.0, see [Gemini CLI installation instructions](https://github.com/google-gemini/gemini-cli?tab=readme-ov-file#-installation).
 
-## Resources
+### Use the extension (upstream CLI mode)
 
-- [Gemini CLI extensions](https://github.com/google-gemini/gemini-cli/blob/main/docs/extensions/index.md): Documentation about using extensions in Gemini CLI
-- [Blog post](https://blog.google/technology/developers/gemini-cli-extensions/): Announcement of Gemini CLI Extensions
-- [GitHub issues](https://github.com/gemini-cli-extensions/code-review/issues): Report bugs or request features
+The Code Review extension adds the `/code-review` command to Gemini CLI which analyzes code changes on your current branch for quality issues. See the [upstream docs](https://github.com/gemini-cli-extensions/code-review) for `/pr-code-review` and MCP setup.
 
 ## Legal
 
-- License: [Apache License 2.0](https://github.com/gemini-cli-extensions/code-review/blob/main/LICENSE)
+- License: [Apache License 2.0](./LICENSE) (inherited from upstream)
+- Upstream: <https://github.com/gemini-cli-extensions/code-review>
diff --git a/commands/codebase-review.toml b/commands/codebase-review.toml
new file mode 100644
index 0000000..672242b
--- /dev/null
+++ b/commands/codebase-review.toml
@@ -0,0 +1,44 @@
+description = "Reviews the whole tracked codebase (or a filtered subset) as a single bundle"
+prompt = """
+<CONTEXT>
+    **Codebase**: see the bundle below. Each file is delimited by a line of the exact form `======== FILE: <path> ========`. Treat the content between two delimiters as the file's contents. **Each content line is prefixed with its 1-indexed line number followed by `: ` (e.g. `   808: def foo():`).** When you cite a line in your findings, transcribe the prefix value verbatim — do not count lines yourself. The line number is the position in that specific file's content (1-indexed at the line immediately following the delimiter), not a position in the overall bundle.
+
+<CODEBASE_BUNDLE_PLACEHOLDER>
+</CONTEXT>
+
+<PROTOCOL>
+Activate the `code-review-codebase` skill for persona, objective, instructions and critical constraints. Then follow the exact structure and rules in the `<OUTPUT>` section.
+</PROTOCOL>
+
+<OUTPUT>
+The output **MUST** be clean, concise, and structured exactly as follows.
+
+**If no issues are found:**
+
+# Codebase review summary: [Single sentence high-level take on the codebase].
+No issues found. Code looks clean.
+
+**If issues are found:**
+
+# Codebase review summary: [Single sentence high-level take on the codebase].
+[Optional 1-2 sentence general feedback for the whole codebase, e.g. a recurring pattern that does not fit a single file.]
+
+## File: path/to/file/one
+### L<LINE_NUMBER>: [<SEVERITY>] Single sentence summary of the issue.
+
+More details about the issue, including why it is an issue (e.g., "This could lead to a null pointer exception").
+
+Suggested change:
+```
+    <code snippet showing the proposed fix; preserve the surrounding
+    indentation so the suggestion can be pasted in directly>
+```
+
+### L<LINE_NUMBER_2>: [MEDIUM] Summary of the next problem.
+More details about this problem, including where else it occurs if applicable (e.g., "Also seen at backend/foo.py L18 and backend/bar.py L42.").
+
+## File: path/to/file/two
+### L<LINE_NUMBER_3>: [HIGH] Summary of the issue in the next file.
+Details...
+</OUTPUT>
+"""
diff --git a/docs/llm-code-review-runbook.md b/docs/llm-code-review-runbook.md
new file mode 100644
index 0000000..82d2cd9
--- /dev/null
+++ b/docs/llm-code-review-runbook.md
@@ -0,0 +1,357 @@
+# LLM code-review runbook
+
+Operational guide for using `review.py` as an iteration partner during code work. Same workflow whether you're a human developer or a coding agent (Claude, Codex, etc.).
+
+The runner is a thin Python wrapper around the upstream `gemini-cli-extensions/code-review` skill + command prompts. It POSTs them to OpenRouter, the Gemini API, or a local Ollama server (offline / no API key / no token cost), and prints structured-markdown findings — same prompts, same output shape as the GitHub `/gemini review` bot, without the 5–15 min webhook → job-queue wait.
+
+---
+
+## Setup
+
+```
+Repository:    https://github.com/Airwhale/local-gemini-code-review
+Entry point:   review.py at the repo root
+Dependencies:  uv-managed -- first run installs them
+Config:        .env at the runner's repo root (NOT at the project being reviewed)
+```
+
+Set the key for whichever cloud provider you'll use; missing keys fail fast with a clear error. The local provider needs no key — just a running Ollama server.
+
+- `OPENROUTER_API_KEY` — default provider
+- `GEMINI_API_KEY` — direct Google AI Studio path
+- *(none)* — local Ollama. Requires `ollama serve` running and at least one model pulled.
+
+Optional environment variables:
+
+- `CODE_REVIEW_PROVIDER` — set provider default (e.g. `ollama`)
+- `OPENROUTER_MODEL` / `GEMINI_MODEL` / `OLLAMA_MODEL` — override the per-provider default model
+- `OLLAMA_HOST` — Ollama server URL (default `http://localhost:11434`). Override for non-default ports, remote Ollama, or WSL distros without localhost mirroring.
+- `OLLAMA_TIMEOUT` — HTTP timeout for Ollama calls in seconds (default `1800`, i.e. 30 minutes — accommodates CPU cold-starts and thorough reviews).
+
+---
+
+## Invocation
+
+From any project directory:
+
+```bash
+cd /path/to/your-project
+uv run --project /path/to/local-gemini-code-review /path/to/local-gemini-code-review/review.py --base origin/main
+```
+
+Or from the runner directory against an external CWD:
+
+```bash
+cd /path/to/local-gemini-code-review
+uv run review.py --pr 42
+```
+
+The runner reads `.env` from its own directory — configure once, invoke from anywhere.
+
+### Source modes (mutually exclusive)
+
+| Flag                 | Source scope                                                   | Use when                                |
+|----------------------|----------------------------------------------------------------|-----------------------------------------|
+| *(none)*             | diff: merge-base against `origin/HEAD`                         | quick check on current branch           |
+| `--base origin/main` | diff: two-dot diff vs ref, **includes working tree**           | iterating before commit                 |
+| `--pr <N>`           | diff: `gh pr diff N` (requires `gh auth login`)                | reviewing an existing PR                |
+| `--staged`           | diff: staged-only                                              | pre-commit hook style                   |
+| `--codebase`         | whole codebase: `git ls-files` bundled (filtered); see below   | auditing an unfamiliar repo             |
+
+### Providers
+
+| `--provider`    | Env key required        | Default model            | Notes                                                                                   |
+|-----------------|-------------------------|--------------------------|-----------------------------------------------------------------------------------------|
+| `openrouter` *  | `OPENROUTER_API_KEY`    | `google/gemini-2.5-pro`  | Default. Reliable quota; recommended for iterative work.                                |
+| `gemini`        | `GEMINI_API_KEY`        | `gemini-2.5-pro`         | Direct to Google AI Studio. **Free tier has zero quota for pro** — use flash if free.   |
+| `ollama`        | *(none — local)*        | `qwen3-coder:30b`        | Offline / no API key / no token cost. CPU inference is slower (1–5 min per review typical). Different failure mode than cloud — see "Local vs cloud" below. |
+
+\* default
+
+### Local vs cloud — pick deliberately
+
+Empirically, the two paths fail in **opposite directions**. Cloud models hallucinate at higher temperatures; local models under-report. Concrete evidence from integration testing on a 31 K-char diff:
+
+- **`google/gemini-2.5-pro` via OpenRouter at `temperature=0.5`** returned a HIGH-severity finding that referenced a CLI flag (`--timeout`) and "help text" that did **not** exist in the codebase. The suggested fix would have crashed the runner with `AttributeError`. Confident, structured, plausible — and entirely fabricated.
+- **`qwen3-coder:30b` via local Ollama on the same diff** returned a clean "no issues found." Likely missed legitimate nits (type narrowing, dead code, etc.) but did not invent any.
+
+The temperature default was lowered from `0.5` to `0.3` (current) partly in response to that hallucination. Even at `0.3`, treat every finding as a hypothesis to verify, not an instruction.
+
+**Use cloud when:** you want fast, structured triage; iterating against a small diff; converging in fewer rounds matters more than zero false positives.
+
+**Use Ollama (local) when:** you're offline; the code is sensitive enough you don't want it leaving the machine; you want a free sanity check; the cloud providers are rate-limited / down; you want a second-opinion pass after a cloud review (different blind spots catch different things).
+
+**For high-stakes PRs**, run *both* and reconcile.
+
+### Setting up Ollama (when `--provider ollama` is right)
+
+1. Install [Ollama](https://ollama.com/download). If Smart App Control / Application Control blocks the installer on Windows, install in WSL2 — the runner reaches the WSL server via localhost mirroring without extra config.
+2. Pull a model. Default expected by the runner:
+
+    ```bash
+    ollama pull qwen3-coder:30b
+    ```
+
+    `qwen3-coder:30b` is a 30B MoE with ~3.3B active parameters — the quality/speed sweet spot on CPU because active-param count drives inference speed, not total. Use the `local-pro` alias (`qwen3-coder-next`) for a larger MoE if disk + quality budget allows.
+
+3. Verify the server is reachable from the runner's host:
+
+    ```bash
+    curl http://localhost:11434/api/tags
+    ```
+
+    Should return a JSON list including the model you pulled.
+
+4. Run the review:
+
+    ```bash
+    uv run review.py --provider ollama --base origin/main
+    ```
+
+The runner's Ollama-specific errors are surgical: a connection refusal raises `ConfigError` (exit 2) suggesting `ollama serve`; a 404 on the model raises `ConfigError` (exit 2) with the exact `ollama pull <model>` command to run. No retries on these — they're configuration problems the caller has to fix.
+
+### Model selection
+
+`--model <slug>` overrides the default. Aliases are **scoped per provider** — each one only resolves under its declared `--provider`. Passing an alias to the wrong provider raises a typed `CONFIG` error naming the right one rather than sending an invalid model name to the upstream API.
+
+**OpenRouter aliases** (use with `--provider openrouter`, the default):
+
+| Alias | Resolves to | Notes |
+|---|---|---|
+| `pro` / `gemini-pro` | `google/gemini-2.5-pro` | Current default. |
+| `flash` / `gemini-flash` | `google/gemini-2.5-flash` | ~3× faster than `pro` with some quality loss — use during heavy iteration, switch to `pro` for the final pass. |
+| `claude` / `claude-sonnet` | `anthropic/claude-sonnet-4.5` | Great as a second-opinion reviewer alongside a Gemini round. |
+| `claude-opus` | `anthropic/claude-opus-4.5` | Larger model; slower and pricier. |
+| `gpt` | `openai/gpt-5` | Independent third opinion. |
+| `gpt-mini` | `openai/gpt-5-mini` | Cheaper / faster GPT. |
+| `deepseek` | `deepseek/deepseek-chat-v3.1` | Cheap, surprisingly strong on code review. |
+
+**Ollama aliases** (use with `--provider ollama`):
+
+| Alias | Resolves to | Notes |
+|---|---|---|
+| `local` | `qwen3-coder:30b` | Current Ollama default. 30B MoE with ~3.3B active params — the recommended quality/speed balance on CPU. |
+| `local-pro` | `qwen3-coder-next` | 80B/3B MoE. Higher quality at the cost of ~40 GB download and slightly slower active path. |
+
+**The `gemini` (direct-API) provider has no aliases** — it takes bare Gemini model names only (e.g. `gemini-2.5-pro`, `gemini-2.5-flash`). Raw provider-native slugs always pass through unchanged, so anything OpenRouter serves or anything pulled into Ollama — including newer models without aliases yet — works via `--model <slug>`.
+
+### Tuning sampling: `--temperature` and `--max-tokens`
+
+Two runtime knobs control how much the model says and how exploratory it is:
+
+| Flag | Default | What it controls | When to change |
+|---|---|---|---|
+| `--temperature <float>` | `0.3` (env: `CODE_REVIEW_TEMPERATURE`) | Sampling randomness. Higher = more exploration, more findings per call, more hallucinations. Lower = tighter, more conservative, fewer findings. | Drop to `0.2` for security-critical PRs where decline-comment overhead is expensive. Raise to `0.5–0.7` for first-pass audits where you want maximum coverage and can afford the false-positive rate. |
+| `--max-tokens <int>` | `16000` (env: `CODE_REVIEW_MAX_TOKENS`) | Ceiling on output token count. Not a target — you pay only for what the model actually emits. Default ensures a thorough review isn't truncated mid-finding. | Rarely needs changing. Drop to `4000` if you genuinely want short, focused output (e.g. CI-step where only critical findings matter). Raise if you see truncated `Suggested change:` blocks in very large reviews. |
+
+The temperature default has been retuned twice based on empirical observation:
+
+- **`0.2`** (original): too conservative — 1–2 findings per round on diffs that plausibly contained more, requiring 5–7 rounds to converge.
+- **`0.5`** (raised in response to the above): more findings per round (3–5 typical), but during cross-model integration testing `google/gemini-2.5-pro` produced a HIGH-severity finding that referenced a CLI flag (`--timeout`) and quoted "help text" that did not exist in the codebase. The proposed fix would have crashed the runner with `AttributeError: 'Namespace' object has no attribute 'timeout'`. Confident, well-formatted, and a hallucination.
+- **`0.3`** (current): tight enough to cut the hallucination rate, loose enough to keep "more findings than 0.2." If your project shows different behavior, retune; the constant in `review.py` documents the history so the next maintainer can see the evidence.
+
+### Whole-codebase mode (`--codebase`)
+
+For situations that diff review can't help with — auditing a repo you just inherited, finding bugs in code none of your PRs have touched — `--codebase` bundles every tracked file (filtered) and reviews them as a single payload.
+
+```bash
+uv run review.py --codebase                              # all tracked files, minus built-in noise
+uv run review.py --codebase --include 'backend/**/*.py'  # narrow to a directory + extension
+uv run review.py --codebase --exclude '**/test_*'        # widen then narrow
+uv run review.py --codebase --model claude               # use Claude as the codebase reviewer
+```
+
+Selection pipeline: `git ls-files` → user `--include` (if any) → user `--exclude` → built-in defensive excludes (lock files, minified bundles, binary asset extensions, `*/dist/*`, `*/build/*`) → drop individual files >100 KB (logged on stderr).
+
+Pre-flight bundle cap is 700 K chars (~175 K tokens — conservative against both Gemini 2.5 Pro's 1 M context and Claude Sonnet 4.5's 200 K context). If the bundle exceeds the cap, the runner exits with the 10 largest files in the current selection so you can target `--exclude` effectively rather than paying for a request that would fail mid-flight.
+
+Output is the same severity-tagged per-file findings format as diff mode — same accept/decline heuristics apply.
+
+---
+
+## Iteration loop
+
+```
+1. Edit the target repo (fix a bug, build a feature, etc.).
+2. (If reviewing untracked files in codebase mode: `git add -N <paths>`
+   first so `git ls-files` sees them. No staged content; just makes
+   them visible to the bundler.)
+3. Run:  uv run --project <runner> <runner>/review.py --base origin/main
+4. Read the structured-markdown output. Findings are tagged
+   CRITICAL > HIGH > MEDIUM > LOW.
+5. For each finding, decide: accept or decline.
+6. Apply accepted fixes inline. Do NOT commit yet.
+7. Re-run step 3.
+8. Repeat until ONE of:
+   - Output is "No issues found. Code looks clean..."
+   - A round produces only hallucinated findings (decline-only round)
+   - You have hit 4 rounds and remaining findings are stylistic noise
+9. Run tests + build. Commit. Push.
+```
+
+**Do not commit between rounds.** `--base origin/main` uses a two-dot diff (`git diff -U5 <base>`), which includes working-tree edits. Re-running picks up in-progress fixes immediately. Committing every round produces noisy history; the reviewer is happy reviewing uncommitted edits.
+
+**The 4-round cap is a pragmatic ceiling, not a hard rule.** Empirically, real bugs surface in rounds 1–3 and round 4 is usually either clean or "all hallucinations." If round 4 is still producing substantive findings, the diff is too large — split it.
+
+**Codebase mode requires tracked files.** The runner uses `git ls-files` for safety (so it doesn't accidentally review `.venv/`, build artefacts, etc.). For new code that hasn't been committed yet, `git add -N <paths>` marks files as intent-to-add — they appear in `git ls-files` but their content stays unstaged. This is the cleanest way to get a review on work-in-progress without committing prematurely.
+
+---
+
+## Accept / decline heuristics
+
+**Accept by default:**
+
+- **CRITICAL / HIGH** unless the finding is factually wrong (rare). Real correctness or security bugs.
+- **MEDIUM** about correctness, concurrency, atomicity, latent bugs, schema or type safety.
+- **MEDIUM** about defensive coding — wider exception catches, header normalization, partial-key cache invalidation. Cheap hardening.
+- **LOW** when trivially right: consolidating duplicates, fixing typos, tightening assertions, correcting inaccurate comments.
+
+**Decline (and add a code comment explaining why):**
+
+- Findings that contradict load-bearing design intent already encoded. *Shape:* a pair of API endpoints that look redundant but are deliberately split for a planned future migration. The reviewer doesn't know your roadmap unless your code comments tell it.
+- Findings that would untighten a deliberately-tight test. *Shape:* a hardcoded expected count the reviewer wants computed dynamically from the fixture — that turns the test into a tautology (*what the code reads == what the code reports*) and removes the regression-catch.
+- Stylistic preferences that don't change correctness when the existing form has a defensible rationale. *Shape:* a cache `maxsize` chosen with deliberate headroom for test fixtures rather than the obvious-singleton value.
+- **Hedged findings ("if X is true, this is HIGH/CRITICAL").** The reviewer hedges when it lacks evidence to fully evaluate a hypothesis. In practice these resolve as false positives more often than as real bugs. *Shape:* "If `approved_body_hash` doesn't exclude `route_approval`, this is CRITICAL" — verify the premise (`approved_body_hash` does exclude it via an explicit `APPROVED_BODY_HASH_EXCLUDES` constant) before acting. Treat the hedge as a flag to spot-check the premise, not as an instruction.
+- **Self-refuting findings.** When a finding's own "better fix" matches the existing code, the reviewer hallucinated a bug it then walked back. *Shape:* "Wrong type at L772 — a better fix would be to pass the list of vectors directly" when the actual line (which may not even be L772) already does exactly that. Decline without action; no code comment needed because the next round won't re-flag the same hallucination consistently.
+
+---
+
+## The decline contract
+
+**Every declined finding gets a code comment immediately adjacent to the flagged code**, explaining why the suggestion was rejected. Without the comment, the next iteration's model will surface the same finding again. The comment is a contract with future review rounds — it's how you teach the reviewer about decisions it cannot infer from the diff alone.
+
+This is the central operational rule. Without it, the loop churns on the same findings indefinitely. With it, the loop converges to clean in a small number of rounds.
+
+---
+
+## Error model
+
+The runner exits with **typed exit codes** so an LLM caller can react differently to each failure mode without parsing prose. The full table + decision tree lives in the README's "Error model (for LLM callers)" section; the short version:
+
+| Exit | Category | Quick action |
+|---|---|---|
+| 0 | OK | use stdout |
+| 2 | CONFIG | fix env / CLI flag; don't retry |
+| 10 | SAFETY_REFUSAL | retry with `--model claude` |
+| 11 | RATE_LIMIT | wait 60s; switch provider if persistent |
+| 12 | CONTEXT_OVERFLOW | narrow scope; don't retry as-is |
+| 13 | PROVIDER_HICCUP | retry; runner already auto-retried once |
+| 14 | TRANSPORT | exponential backoff; escalate after 3 |
+| 1 | UNKNOWN | read stderr; escalate |
+
+Stderr always starts with a stable line `ERROR: <CATEGORY> [exit <N>]` followed by free-form human-readable detail. An agent can grep that prefix to extract the category without parsing.
+
+The runner **auto-retries once** on `PROVIDER_HICCUP` and `TRANSPORT` before exiting; other categories surface immediately because retrying without changes doesn't help.
+
+## Safety context
+
+The runner prepends a short framing prefix to every review request, wrapped in `<CONTEXT_FOR_REVIEWER>...</CONTEXT_FOR_REVIEWER>`, to reduce false-positive content-filter refusals on diffs that contain words like `attack`, `sanctions`, `prompt injection`, `tampering`, `redaction` (common in security testing, AML compliance, policy enforcement code).
+
+Override per call with `--context "<your phrasing>"` or per environment with `$CODE_REVIEW_CONTEXT`. Disable with `--no-context` (rarely useful — the default already drops refusal rate significantly).
+
+See the README's "Safety context" section for the default phrasing.
+
+## Known gotchas
+
+1. **Free-tier 429 on Gemini direct.** `--provider gemini --model gemini-2.5-pro` requires a paid Google AI Studio plan; the free tier returns HTTP 429 immediately (`RATE_LIMIT`, exit 11). Either use `--provider openrouter` (preferred) or `--model gemini-2.5-flash`.
+
+2. **Codebase mode line numbers used to drift.** Before commit `b124501`, the bundle had no per-line anchors and the model estimated line positions from visual context, drifting 5–150 lines depending on file size. As of `b124501` every content line is pre-numbered (`cat -n` style) and the model transcribes the prefix instead of counting. If you ever see drift again on a current build, that's a regression worth investigating — the prompt or bundle format may have been changed in a way that broke the contract.
+
+3. **Two-dot vs three-dot diff.** `--base <ref>` uses two-dot (`git diff -U5 <ref>`) so working-tree changes show up. Three-dot (`<ref>...HEAD`) would show only committed changes and the reviewer would keep re-flagging the same issues. The two-dot semantics is intentional for the iteration workflow.
+
+4. **Diff size shapes round count.** Larger diffs surface more findings per round and take more rounds to converge. Rough observed shapes:
+   - Small PR (~25K-char diff, single feature): 3–4 rounds
+   - Medium PR (~50K chars): 4–6 rounds
+   - Large PR (~300K chars): 8–12 rounds
+
+5. **`tee` to a file.** When output is large, pipe to `tee /tmp/review.md` so you can re-read findings without re-invoking the tool. Saves context budget on subsequent steps.
+
+6. **Codebase-mode bundle cap.** `--codebase` enforces a 700 K-char (~175 K-token) pre-flight cap on the concatenated bundle. If you hit it, the runner exits with `CONTEXT_OVERFLOW` (exit 12) and lists the 10 largest files in the current selection — use those to target `--exclude` flags. Common offenders: vendored fixture JSON, committed schema dumps, test data files.
+
+7. **Ollama cold-start latency.** The first request after starting `ollama serve` (or after a model has been idle for a few minutes) takes 10–60 s extra while the model loads into RAM. Subsequent calls within Ollama's keep-alive window are fast. If you see no output for ~30 s on the first call, that's normal — don't assume the runner hung. `OLLAMA_TIMEOUT` (default 1800 s) covers worst-case CPU cold-start plus a thorough review; lower it if you want faster failure.
+
+8. **Ollama "model not pulled" returns a `CONFIG` error (exit 2), not a 404.** The runner intercepts the 404 from the local server and surfaces it as a typed `CONFIG` error with the exact `ollama pull <model>` command to run. Don't retry without pulling first — retry without changes hits the same error.
+
+9. **Ollama review depth is lower than cloud.** Local models, especially on CPU, tend to under-report findings versus a cloud reviewer on the same diff. This is the inverse of the cloud-hallucinates problem documented under "Local vs cloud." A clean "no issues" from Ollama is **not** equivalent in confidence to a clean review from `claude` or `pro` — treat it as a sanity check, not a final verdict, when stakes are high.
+
+---
+
+## Future modes (TODO)
+
+Two extensions deferred to v2; both have explicit design notes in this section so a future maintainer or LLM agent picking up the project knows the trade-offs already considered.
+
+### Architectural-summary output shape (`--summary`)
+
+A proposed flag that would prepend a high-level "patterns / structure / smells" section to the per-file findings produced by `--codebase`. Useful as orientation on a new-to-you codebase ("here's what this repo is, and here's the shape of its problems").
+
+**Why deferred:** the per-file findings shape that ships today is safer because:
+
+1. **Lower hallucination risk.** Line-level findings either match the code or they don't — directly verifiable. Architectural takes require deep context the model doesn't reliably have, and the model will produce one regardless of whether the codebase actually has the problem it describes.
+2. **More actionable.** Per-file findings translate directly to edits. Architectural observations require human translation, and often the "fix" isn't a code change but a design decision.
+3. **Cheaper token-wise.** A serious architectural section costs 1–2 K output tokens that would otherwise go to concrete findings.
+
+Implementation sketch when the time comes: a separate `--summary` flag (codebase-mode-only) plus a tweak to `commands/codebase-review.toml` to ask for a leading architectural section. The skill prompt likely needs an extra severity bucket (`OBSERVATION`?) or explicit framing that the architectural section is exempt from the "demonstrable bug or improvement" requirement that gates the per-file findings.
+
+### Per-file iteration
+
+If a codebase legitimately exceeds the bundle cap and `--include` filtering would lose too much context, an alternative is one model call per file, aggregating findings client-side. Cheaper per call but more total calls; loses cross-file context the bundle approach preserves. Not in scope until a real user hits the cap on a project they can't reasonably narrow.
+
+---
+
+## When to also call `/gemini review` on GitHub (or run a second provider)
+
+Treat the runner as the **iteration partner** and a second reviewer as the **final-mile verifier**. Options for the verifier:
+
+- **GitHub `/gemini review` bot** — same model family as the default, similar prompts; only advantage is independence ("a third party reviewed this, not my own prompt-following loop").
+- **A different runner provider on the same diff** — cheap and immediate. Cloud and local have different blind spots; running both is the closest thing to a structured "second opinion" you can get without paying for it.
+
+Bring in a verifier when:
+
+- The diff touches concurrency, locks, signing/replay, differential privacy ledgers, policy boundaries, or auth surfaces — anywhere a missed bug has outsized blast radius.
+- The PR is large enough that you want a credibility signal beyond "I ran one model locally."
+- You're about to merge to a protected branch and want one independent confirmation.
+
+**Concrete pattern:** when the cloud reviewer says "no issues," re-run with `--provider ollama` (or vice versa). When they disagree, the disagreement itself is signal — verify the specific finding against the actual code. When they agree on "clean," that's a stronger green light than either alone.
+
+For most PRs, three to five clean local rounds is sufficient and saves 30–45 minutes per PR vs the webhook round-trip.
+
+---
+
+## Per-round tracking
+
+Keep a one-line ledger per round so the final commit message or PR comment can summarize the cycle:
+
+```
+Iter N: <severity> <one-line-description> -> applied | declined-with-comment | clean
+```
+
+After the cycle, the ledger becomes a table in the commit body or PR comment:
+
+```
+| Round | Finding                                            | Action                |
+|-------|----------------------------------------------------|-----------------------|
+| 1     | MED: cache duplicated across modules               | Applied               |
+| 1     | LOW: lru_cache maxsize stylistic nit               | Declined w/ comment   |
+| 2     | MED: CWD-relative path resolution is fragile       | Applied               |
+| 3     | None -- "No issues found. Code looks clean..."     | Ready                 |
+```
+
+This is the artifact a human reviewer reads to understand what changed and why. Don't skip it.
+
+---
+
+## One-liner reference
+
+The command shape used most often during iterative work:
+
+```bash
+uv run --project /path/to/local-gemini-code-review /path/to/local-gemini-code-review/review.py --base origin/main 2>&1 | tee /tmp/review.md
+```
+
+Tail the file in another shell, or pipe to `head -80` if you only want the top findings.
+
+---
+
+## Provenance
+
+This fork keeps the upstream `gemini-cli-extensions/code-review` skill and command prompts byte-identical so upstream improvements rebase cleanly. Fork additions: `review.py` (three-provider runner: OpenRouter / Gemini API / local Ollama), `pyproject.toml`, `.env.example`, `.gitignore`, this runbook, and a rewritten root `README.md`. See the root README for the full list of fork modifications and how to sync against upstream.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..7753351
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,12 @@
+[project]
+name = "code-review-openrouter"
+version = "0.1.0"
+description = "OpenRouter-backed code review using the Gemini CLI code-review extension prompts."
+requires-python = ">=3.11"
+dependencies = [
+    "httpx>=0.27",
+    "python-dotenv>=1.0",
+]
+
+[tool.uv]
+package = false
diff --git a/review.py b/review.py
new file mode 100644
index 0000000..38e116e
--- /dev/null
+++ b/review.py
@@ -0,0 +1,1757 @@
+#!/usr/bin/env python3
+"""Standalone code-review runner for the Gemini CLI code-review extension.
+
+This fork keeps the upstream `skills/code-review-commons/SKILL.md` and
+`commands/code-review.toml` prompts intact (Apache-2.0, unmodified) and adds a
+thin Python runner that sends them to a Gemini-or-other-model via one of three
+providers selectable at the command line:
+
+  --provider openrouter (default)
+      POSTs to OpenRouter's OpenAI-compatible chat-completions endpoint
+      (https://openrouter.ai/api/v1/chat/completions). Requires
+      `OPENROUTER_API_KEY`. Good if you want to mix models from different
+      vendors (Gemini, Claude, GPT, DeepSeek) without separate API keys.
+
+  --provider gemini
+      POSTs to Google AI Studio's `generateContent` endpoint directly
+      (https://generativelanguage.googleapis.com/v1beta/models/...). Requires
+      `GEMINI_API_KEY`. Slightly lower latency (one less hop) and uses the
+      same key the GitHub bot uses on the backend.
+
+  --provider ollama
+      POSTs to a local Ollama server's OpenAI-compatible endpoint
+      (http://localhost:11434/v1/chat/completions by default). No API key
+      required -- the server runs on your machine (or in WSL). Override the
+      URL with `--ollama-host` or `$OLLAMA_HOST` if Ollama listens elsewhere
+      (different port, different machine, WSL with non-default networking).
+      Best for offline / private / cost-free review; trade-off is quality
+      and speed depending on local model size and CPU/GPU.
+
+Provider defaults: openrouter -> ``google/gemini-2.5-pro``, gemini ->
+``gemini-2.5-pro``, ollama -> ``qwen3-coder:30b`` (the MoE coder model
+with ~3.3B active params, the quality/speed sweet spot on CPU). The
+``--model <slug>`` flag overrides per call; ``--provider openrouter``
+and ``--provider ollama`` also accept named aliases (see
+``MODEL_ALIASES_BY_PROVIDER`` below) so you can write ``--model claude``
+or ``--model local`` instead of the full slug.
+
+Diff modes (default) review a git diff:
+
+    uv run review.py                          # diff current branch vs origin/HEAD merge-base
+    uv run review.py --base main              # diff vs an explicit base ref
+    uv run review.py --pr 6                   # review a GitHub PR (uses `gh pr diff`)
+    uv run review.py --staged                 # staged changes only
+
+Whole-codebase mode reviews tracked files (filtered):
+
+    uv run review.py --codebase
+    uv run review.py --codebase --include 'backend/**/*.py'
+    uv run review.py --codebase --exclude '**/test_*'
+
+The .env loaded from this script's directory (not CWD) -- copy `.env.example`
+to `.env` once at the runner location and invoke from any project folder.
+Set whichever of `OPENROUTER_API_KEY` / `GEMINI_API_KEY` your chosen provider
+needs (Ollama doesn't need an API key -- just set `OLLAMA_HOST` if your
+server isn't at the default `http://localhost:11434`).
+"""
+
+from __future__ import annotations
+
+import argparse
+import fnmatch
+import os
+import subprocess
+import sys
+import time
+import tomllib
+from pathlib import Path
+from typing import Callable, TypeVar
+
+import httpx
+from dotenv import load_dotenv
+
+ROOT = Path(__file__).parent.resolve()
+
+
+# ---------------------------------------------------------------------------
+# Error model
+# ---------------------------------------------------------------------------
+#
+# The runner is designed to be called by an LLM agent in a loop, not just by a
+# human at a shell. That means the exit code is a contract: an agent caller
+# needs to be able to react differently to "model refused" vs "rate limit"
+# vs "diff too big" vs "network down" without parsing prose. README's "Error
+# model" section is the public-facing version of this table; the constants
+# below are the in-code source of truth.
+#
+# Exit-code budget: 0 success, 1 catchall, 2 config, 10-19 reserved for typed
+# review errors. Codes 3-9 are conventionally Unix-shell-reserved; we skip
+# them.
+
+
+T = TypeVar("T")
+
+
+class ReviewError(Exception):
+    """Base class for typed runner errors.
+
+    Subclasses set ``exit_code``, ``category``, and ``suggested`` so the error
+    formatter in ``main`` can emit a structured stderr block an LLM caller
+    can pattern-match (``ERROR: <CATEGORY> [exit <N>]``).
+    """
+
+    exit_code: int = 1
+    category: str = "UNKNOWN"
+    suggested: str = "Read stderr for details; escalate if unclear."
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        detail: str | None = None,
+        model: str | None = None,
+        provider: str | None = None,
+    ) -> None:
+        super().__init__(message)
+        self.detail = detail
+        self.model = model
+        self.provider = provider
+
+
+class ConfigError(ReviewError):
+    """Missing API key, invalid CLI / env value, or other static-config bug."""
+
+    exit_code = 2
+    category = "CONFIG"
+    suggested = (
+        "Check the relevant env var or CLI flag (see the error message). "
+        "Do not retry without fixing the config -- retry will hit the same "
+        "error."
+    )
+
+
+class SafetyRefusal(ReviewError):
+    """Model refused on content-filter grounds.
+
+    Returned content was null with ``finish_reason`` indicating SAFETY,
+    ``content_filter``, or equivalent. Re-trying the same model with the
+    same prompt almost always reproduces the refusal; switch model first.
+    """
+
+    exit_code = 10
+    category = "SAFETY_REFUSAL"
+    suggested = (
+        "Retry with a different model: ``--model claude`` is the most "
+        "refusal-resistant on security / policy / adversarial-fixture code. "
+        "If still refused across models, the content may need human review."
+    )
+
+
+class RateLimit(ReviewError):
+    """Provider HTTP 429 -- request quota or RPS cap exceeded."""
+
+    exit_code = 11
+    category = "RATE_LIMIT"
+    suggested = (
+        "Wait 30-60 seconds and retry. If the limit is per-key per-day "
+        "(common on free tiers), switch ``--provider`` or ``--model`` to "
+        "one with separate quota."
+    )
+
+
+class ContextOverflow(ReviewError):
+    """Diff exceeded the model's input or output token budget."""
+
+    exit_code = 12
+    category = "CONTEXT_OVERFLOW"
+    suggested = (
+        "Narrow the diff scope: ``--include`` / ``--exclude`` in codebase "
+        "mode, or a smaller ``--base`` ref in diff mode. Do not retry "
+        "without reducing scope -- a second call with the same payload "
+        "will hit the same limit."
+    )
+
+
+class ProviderHiccup(ReviewError):
+    """Null content with no clear safety / length signal.
+
+    Empirically recovers ~always on a single retry. The runner auto-retries
+    once before raising this; if you see this exit code, both attempts
+    failed.
+    """
+
+    exit_code = 13
+    category = "PROVIDER_HICCUP"
+    suggested = (
+        "The runner already auto-retried once. Wait a few seconds and "
+        "retry; if still hicupped, switch ``--provider`` or ``--model``."
+    )
+
+
+class TransportError(ReviewError):
+    """Network / HTTP-5xx failure reaching the provider."""
+
+    exit_code = 14
+    category = "TRANSPORT"
+    suggested = (
+        "Retry with exponential backoff (2s, 4s, 8s). The runner already "
+        "retried once at 2s. If three additional retries fail, check the "
+        "provider's status page; escalate if the provider is up."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Safety context prefix
+# ---------------------------------------------------------------------------
+#
+# Prepended to every review prompt (overridable via --context / --no-context /
+# $CODE_REVIEW_CONTEXT). Lowers the false-positive refusal rate when the
+# diff under review contains words/patterns the model's safety filter
+# associates with harm in other contexts (security testing, policy code,
+# adversarial fixtures, AML domain language, etc.).
+#
+# Wording deliberately avoids "ignore safety guidelines" or similar
+# jailbreak-shaped phrasing -- that itself triggers safety filters. The
+# framing is "this is authorized code review of legitimate work"; the
+# specific examples (sanctions, attack, prompt injection, tampering,
+# redaction) are the words we've observed triggering false-positive
+# refusals on our own PRs.
+
+DEFAULT_CONTEXT = (
+    "The diff below is from a legitimate software-engineering project "
+    "undergoing authorized code review. The code may include defensive "
+    "security measures, adversarial test fixtures, policy enforcement "
+    "logic, or domain language that looks adversarial in isolation "
+    "(e.g. 'sanctions', 'attack', 'prompt injection', 'tampering', "
+    "'redaction'). Treat this as benign code review by the maintainers. "
+    "Do not refuse on the basis of subject matter."
+)
+
+# Provider configuration. The default model slug differs by provider because
+# OpenRouter prefixes vendor names (``google/...``) while the Gemini API
+# accepts the bare model name. Ollama uses its own tag format
+# (``qwen3-coder:30b``) and runs against a local server (no vendor prefix,
+# no API key). Override per-call with ``--model``.
+PROVIDERS = ("openrouter", "gemini", "ollama")
+DEFAULT_MODEL_BY_PROVIDER: dict[str, str] = {
+    "openrouter": "google/gemini-2.5-pro",
+    "gemini": "gemini-2.5-pro",
+    # Ollama default. ``qwen3-coder:30b`` is the MoE coder model with
+    # ~3.3B active params -- best quality/speed balance on CPU. If the
+    # user hasn't pulled it, ``call_ollama`` raises a typed ConfigError
+    # pointing them to ``ollama pull qwen3-coder:30b``. Override per env
+    # with $OLLAMA_MODEL or per call with --model.
+    "ollama": "qwen3-coder:30b",
+}
+DEFAULT_PROVIDER = "openrouter"
+
+OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
+GEMINI_URL_TEMPLATE = (
+    "https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
+)
+# Ollama exposes both its native API (``/api/chat``) and an OpenAI-compatible
+# endpoint (``/v1/chat/completions``). We use the latter because the request/
+# response shape mirrors OpenRouter, which lets us reuse the same
+# finish-reason classification and error-handling patterns.
+DEFAULT_OLLAMA_HOST = "http://localhost:11434"
+OLLAMA_CHAT_PATH = "/v1/chat/completions"
+HTTP_TIMEOUT = 300.0  # Gemini 2.5 Pro on a ~5K-line diff lands ~30-60s; pad
+                     # generously for very large diffs and whole-codebase
+                     # bundles.
+# Local CPU inference on a 30B MoE coder model takes ~10-25 tok/sec on
+# modern hardware, so a thorough review (1500-3000 output tokens) can run
+# 1-5 minutes; cold-start model load adds another 10-60s on the first
+# call after server start. ``HTTP_TIMEOUT`` (300s = 5 min) is too tight
+# for that worst case, so Ollama gets its own ceiling. Override with
+# $OLLAMA_TIMEOUT if you're on slower hardware or running larger models.
+DEFAULT_OLLAMA_TIMEOUT = 1800.0  # 30 minutes
+
+# Sampling temperature for the model. History of this constant:
+#
+#   0.2  (original): too conservative -- 1-2 findings per round on diffs
+#        that plausibly contained more; 5-7 rounds to converge.
+#   0.5  (raised after observing the above): more findings per round
+#        (3-5 typical), but on a later cross-model comparison
+#        ``google/gemini-2.5-pro`` produced a HIGH-severity finding
+#        that referenced a CLI flag (``--timeout``) and quoted "help
+#        text" that don't exist in the codebase -- a clean
+#        hallucination that would have crashed the runner if its
+#        suggested fix had been applied verbatim.
+#   0.3  (current): split the difference. Tighter than 0.5 to cut the
+#        hallucination rate; looser than 0.2 to keep the "surface more
+#        findings" benefit. Empirical re-tuning is encouraged if you
+#        observe drift either way; override per call with
+#        ``--temperature`` or per environment with
+#        ``$CODE_REVIEW_TEMPERATURE``.
+DEFAULT_TEMPERATURE = 0.3
+
+# Maximum output tokens the model may emit. Raised from the implicit
+# provider default (~8K for Gemini 2.5 Pro) to 16K so a thorough review
+# isn't truncated mid-finding. This is a *ceiling*, not a target -- you
+# pay only for tokens actually emitted, not for unused headroom.
+# Override with ``--max-tokens`` or ``$CODE_REVIEW_MAX_TOKENS``.
+DEFAULT_MAX_TOKENS = 16000
+
+# Named aliases for model slugs, scoped per provider. Each provider has
+# its own slug format (OpenRouter: ``vendor/model``, Gemini API: bare
+# ``gemini-...`` names, Ollama: ``family:tag``), and an alias is only
+# valid for its declared provider. Aliases are resolved before the call
+# is made; the resolved slug is what shows up in stderr "Reviewing ...
+# with ..." and what the provider actually dispatches to.
+#
+# When a user passes ``--model <alias>`` with the wrong provider, the
+# runner raises a typed ConfigError pointing them at the correct
+# ``--provider`` instead of silently sending an invalid model name to
+# the upstream API.
+#
+# Keep these tables small and curated. The aliases here are the models
+# that earned their place as a "second-opinion reviewer" in practice;
+# users who want exotic models can pass the raw slug via ``--model``.
+MODEL_ALIASES_BY_PROVIDER: dict[str, dict[str, str]] = {
+    "openrouter": {
+        # Gemini family (OpenRouter route to the same models the
+        # ``--provider gemini`` direct path serves).
+        "pro": "google/gemini-2.5-pro",
+        "gemini-pro": "google/gemini-2.5-pro",
+        "flash": "google/gemini-2.5-flash",
+        "gemini-flash": "google/gemini-2.5-flash",
+        # Anthropic / Claude family.
+        "claude": "anthropic/claude-sonnet-4.5",
+        "claude-sonnet": "anthropic/claude-sonnet-4.5",
+        "claude-opus": "anthropic/claude-opus-4.5",
+        # OpenAI / GPT family. These slugs are current OpenRouter catalog
+        # entries; an older reviewer with a pre-2025 training cutoff may
+        # flag them as nonexistent because GPT-5 / GPT-5-mini postdate that
+        # cutoff. Verify against the live catalog before "fixing" them back
+        # to gpt-4o.
+        "gpt": "openai/gpt-5",
+        "gpt-mini": "openai/gpt-5-mini",
+        # DeepSeek -- cheap, surprisingly strong at code review.
+        "deepseek": "deepseek/deepseek-chat-v3.1",
+    },
+    "ollama": {
+        # Local model tiers, both qwen3-coder. ``local`` is the
+        # recommended default (30B MoE with ~3.3B active params, the
+        # quality/speed sweet spot on CPU because active param count
+        # drives inference speed, not total params). ``local-pro`` is
+        # the larger MoE (80B/3B active) for users who want higher
+        # quality and can spare the disk (~40 GB) for marginal speed
+        # cost. Users who haven't pulled a given tag get a typed
+        # ConfigError from ``call_ollama`` pointing them at the right
+        # ``ollama pull`` command.
+        #
+        # No ``local-fast`` alias: qwen3-coder doesn't ship a small
+        # dense variant on Ollama, and the qwen2.5-coder family is a
+        # generation behind on code review quality. Users who need a
+        # smaller model should pass the explicit ``--model`` slug
+        # (e.g. ``--model qwen2.5-coder:7b``) rather than rely on a
+        # "fast" alias that papers over the generation gap.
+        "local": "qwen3-coder:30b",
+        "local-pro": "qwen3-coder-next",
+    },
+    # ``gemini`` (direct-API) has no aliases -- it takes bare Gemini
+    # model names only. Passing an alias with --provider gemini raises
+    # a ConfigError so the user gets a clear message rather than a
+    # cryptic 404 from Google's API.
+}
+
+# Backwards-compatible flat alias map. Some external tooling or tests
+# may have imported the old top-level ``MODEL_ALIASES`` dict that
+# contained only the OpenRouter aliases. The new authoritative table
+# is ``MODEL_ALIASES_BY_PROVIDER``; this name is preserved as a
+# read-only view over the OpenRouter slice for compatibility.
+MODEL_ALIASES: dict[str, str] = MODEL_ALIASES_BY_PROVIDER["openrouter"]
+
+# Upstream `code-review.toml` instructs the model to *call* git itself via a
+# tool. We have no tool layer; instead we extract the diff up front and
+# substitute it into the prompt. This is the literal sentence from
+# `commands/code-review.toml`; if upstream rewords it the substitution
+# silently no-ops and the script still works (the diff is also appended
+# unconditionally if the substitution missed).
+TOOL_CALL_INSTRUCTION = (
+    "**Code Changes**: call the `git diff -U5 --merge-base origin/HEAD` "
+    "tool to retrieve the changes."
+)
+
+# Substitution sentinel for the codebase-review command template. The
+# fork-added ``commands/codebase-review.toml`` puts this literal token
+# where the codebase bundle goes; if it's missing (e.g. a future rewrite
+# of the command template) the bundle is appended unconditionally so the
+# model still has the content.
+CODEBASE_PLACEHOLDER = "<CODEBASE_BUNDLE_PLACEHOLDER>"
+
+# Whole-codebase mode constants.
+#
+# ``MAX_BUNDLE_CHARS`` is the hard pre-flight cap on the concatenated
+# codebase bundle. 700K chars is ~175K tokens at the standard 4-chars-
+# per-token estimate, which is conservative against both Gemini 2.5 Pro
+# (1M-token context) and Claude Sonnet 4.5 (200K-token context). Cap
+# means the runner errors out before paying for a request that would
+# fail mid-flight on the smaller-context model.
+MAX_BUNDLE_CHARS = 700_000
+
+# Individual-file size cap: skip files larger than this when bundling.
+# Most files this large are data fixtures, vendored blobs, or generated
+# artifacts that ``git ls-files`` happens to track but that don't
+# benefit from a code review. Skipped paths are logged on stderr so the
+# user can ``--include`` them back if a real source file got caught.
+MAX_INDIVIDUAL_FILE_BYTES = 100_000
+
+# Defensive built-in exclusions applied after the user's ``--include``
+# and ``--exclude`` filters. ``git ls-files`` already respects
+# ``.gitignore``, so vendored dirs (``node_modules``, ``.venv``, etc.)
+# are usually already absent. These patterns catch the residue:
+# lock files, minified output, common binary / asset extensions. Match
+# is case-sensitive (file extensions in practice are lowercase).
+BUILTIN_CODEBASE_EXCLUDES: tuple[str, ...] = (
+    # Lock files for various package managers.
+    "*.lock",
+    "package-lock.json",
+    "yarn.lock",
+    "uv.lock",
+    "Cargo.lock",
+    "poetry.lock",
+    "Pipfile.lock",
+    # Minified / generated bundles.
+    "*.min.js",
+    "*.min.css",
+    # Build outputs occasionally tracked by mistake.
+    "*/dist/*",
+    "*/build/*",
+    # Binary / asset extensions: skip outright (model can't review).
+    "*.png", "*.jpg", "*.jpeg", "*.gif", "*.svg", "*.ico", "*.webp",
+    "*.woff", "*.woff2", "*.ttf", "*.eot",
+    "*.pdf", "*.zip", "*.tar", "*.gz",
+    "*.mp3", "*.mp4", "*.mov", "*.avi",
+)
+
+# Per-file delimiter for whole-codebase bundles. The shape is chosen so
+# the model can pattern-match file boundaries reliably and quote the
+# exact path back in its per-file findings.
+FILE_DELIMITER_TEMPLATE = "======== FILE: {path} ========"
+
+
+def load_skill(name: str = "code-review-commons") -> str:
+    """Return the SKILL.md content for the named skill directory.
+
+    Defaults to the upstream ``code-review-commons`` skill (the
+    diff-review one). Whole-codebase mode passes ``code-review-codebase``
+    (fork-added) which differs only in the Critical Constraints section:
+    it permits commenting on any line in any file in the bundle, instead
+    of the upstream skill's hardcoded "only lines beginning with +/-"
+    rule that's correct for diff review but forbids all comments on
+    whole-file input.
+    """
+    path = ROOT / "skills" / name / "SKILL.md"
+    return path.read_text(encoding="utf-8")
+
+
+def load_command_prompt(name: str) -> str:
+    """Load `commands/<name>.toml` and return the `prompt` field verbatim."""
+    path = ROOT / "commands" / f"{name}.toml"
+    data = tomllib.loads(path.read_text(encoding="utf-8"))
+    return data["prompt"]
+
+
+def _apply_context(user_prompt: str, context: str | None) -> str:
+    """Prepend a safety-context block to the user prompt.
+
+    Wrapping the context in a labeled XML-style tag (``<CONTEXT_FOR_REVIEWER>``)
+    rather than free-floating prose keeps the model from accidentally treating
+    the framing as part of the code it should review. ``None`` / empty
+    short-circuits to the bare prompt for ``--no-context`` mode.
+    """
+    if not context:
+        return user_prompt
+    return (
+        "<CONTEXT_FOR_REVIEWER>\n"
+        f"{context}\n"
+        "</CONTEXT_FOR_REVIEWER>\n\n"
+        f"{user_prompt}"
+    )
+
+
+def build_diff_prompts(diff: str, context: str | None) -> tuple[str, str]:
+    """Construct ``(system, user)`` prompts for diff-mode review.
+
+    Loads the upstream ``code-review-commons`` skill (system prompt) and
+    the upstream ``code-review`` command (user prompt template), then
+    substitutes the diff into the command's tool-call placeholder and
+    prepends the optional safety-context block to the user prompt.
+    """
+    system_prompt = load_skill("code-review-commons")
+    user_template = load_command_prompt("code-review")
+    diff_block = f"**Code Changes**:\n\n```diff\n{diff}\n```"
+    user_prompt = user_template.replace(TOOL_CALL_INSTRUCTION, diff_block)
+    # Defensive: if upstream rewords the tool-call sentence and our literal
+    # substitution missed, append the diff so the model still has it.
+    if diff_block not in user_prompt:
+        user_prompt = f"{user_prompt}\n\n{diff_block}"
+    return system_prompt, _apply_context(user_prompt, context)
+
+
+def build_codebase_prompts(bundle: str, context: str | None) -> tuple[str, str]:
+    """Construct ``(system, user)`` prompts for whole-codebase review.
+
+    Uses the fork-added ``code-review-codebase`` skill and
+    ``codebase-review`` command. The skill differs from the upstream
+    ``code-review-commons`` only in the Critical Constraints section:
+    it permits commenting on any line in any file in the bundle (the
+    upstream rule "comment only on +/- lines" forbids all comments on
+    whole-file content, which is correct for diff review but wrong for
+    codebase review).
+
+    TODO: v2 -- architectural-summary mode (proposed flag ``--summary``)
+    would prepend a leading "patterns / structure / smells" section to
+    the per-file findings. Trade-offs (hallucination risk on
+    architectural takes, less actionable output, token-budget
+    contention) documented in ``docs/llm-code-review-runbook.md`` under
+    "Future modes". The current codebase prompt produces per-file
+    findings only.
+    """
+    system_prompt = load_skill("code-review-codebase")
+    user_template = load_command_prompt("codebase-review")
+    bundle_block = f"**Codebase**:\n\n{bundle}\n"
+    if CODEBASE_PLACEHOLDER in user_template:
+        user_prompt = user_template.replace(CODEBASE_PLACEHOLDER, bundle_block)
+    else:
+        # Defensive: same shape as ``build_diff_prompts`` defensive append.
+        user_prompt = f"{user_template}\n\n{bundle_block}"
+    return system_prompt, _apply_context(user_prompt, context)
+
+
+def _format_size(n_bytes: int) -> str:
+    """Format a byte count in decimal-prefix units the way OS file managers do.
+
+    Returns ``"<n> B"`` below 1 KB, ``"<n> KB"`` (1000-based, integer) up to
+    1 MB, and ``"<n.x> MB"`` with one decimal above that. Decimal (1000)
+    rather than binary (1024) so the displayed value matches what users see
+    in their file manager and matches the decimal byte constants declared
+    in this module (``MAX_INDIVIDUAL_FILE_BYTES = 100_000`` reads as
+    "100 KB"). One source of truth for size display means the messages
+    stay sensible if the constants ever change scale.
+    """
+    if n_bytes >= 1_000_000:
+        return f"{n_bytes / 1_000_000:.1f} MB"
+    if n_bytes >= 1_000:
+        return f"{n_bytes // 1_000} KB"
+    return f"{n_bytes} B"
+
+
+def _run_git(args: list[str]) -> str:
+    """Run a git command in the current working directory and return stdout.
+    Surfaces non-zero exits with the command and stderr so the user sees why.
+
+    ``encoding="utf-8"`` is explicit: without it, ``text=True`` decodes via
+    ``locale.getpreferredencoding(False)``, which on Windows is cp1252.
+    Source files containing non-ASCII characters (em-dashes, arrows, the
+    section sign) then come back to Python as mojibake (``â€"`` for ``--``,
+    ``â†'`` for ``->``, ``Â§`` for ``§``), the model sees the mojibake in
+    the diff, and the next review iteration flags "character encoding
+    artifacts in documentation files" -- a finding that doesn't exist in
+    the source, only in the runner's decoding step. ``errors="replace"``
+    is defensive in case git ever emits bytes that aren't valid UTF-8
+    (rare; usually a corrupted file).
+    """
+    try:
+        result = subprocess.run(
+            args,
+            capture_output=True,
+            text=True,
+            check=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+    except subprocess.CalledProcessError as exc:
+        # Raise a typed error rather than ``sys.exit(exc.returncode)``
+        # so an LLM caller sees the documented ``ERROR: CONFIG [exit 2]``
+        # contract instead of a raw subprocess exit code (which collides
+        # with the UNKNOWN/1 bucket). Git failures are almost always a
+        # misconfigured ref / non-git directory / similar setup issue
+        # the caller has to fix before retry makes sense.
+        raise ConfigError(
+            f"`{' '.join(args)}` failed (exit {exc.returncode})",
+            detail=exc.stderr.strip(),
+        ) from exc
+    return result.stdout
+
+
+def git_diff_local(base: str | None, staged: bool) -> str:
+    """Produce a unified diff matching the upstream `git diff -U5
+    --merge-base origin/HEAD` shape so the model sees what the GitHub bot
+    would see.
+
+    Working-tree changes are included by default. The iterative-review use
+    case (run, fix, re-run before committing) breaks if we restrict to
+    ``base...HEAD`` because uncommitted fixes are invisible and the model
+    re-flags the same issues forever. ``git diff <base>`` (two-dot;
+    merge-base..working-tree) covers committed + staged + unstaged in one
+    pass, which is what "review everything I'm proposing to ship" means.
+    """
+    if staged:
+        return _run_git(["git", "diff", "--cached", "-U5"])
+    if base:
+        return _run_git(["git", "diff", "-U5", base])
+    return _run_git(["git", "diff", "-U5", "--merge-base", "origin/HEAD"])
+
+
+def pr_diff(pr_number: int) -> str:
+    """Pull a PR's diff via `gh`. Requires `gh auth login` to have run.
+
+    Same Windows-locale concern as ``_run_git``: explicit
+    ``encoding="utf-8"`` keeps PR diffs containing em-dashes, arrows,
+    section signs, and other non-ASCII characters from being mangled
+    into cp1252 mojibake before the model sees them.
+    """
+    try:
+        result = subprocess.run(
+            ["gh", "pr", "diff", str(pr_number), "--patch"],
+            capture_output=True,
+            text=True,
+            check=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+    except FileNotFoundError as exc:
+        # Same typed-error contract as ``_run_git``: callers see
+        # ``ERROR: CONFIG [exit 2]`` and know to install ``gh`` /
+        # adjust PATH before retrying.
+        raise ConfigError(
+            "`gh` not found on PATH. Install GitHub CLI to use --pr.",
+        ) from exc
+    except subprocess.CalledProcessError as exc:
+        raise ConfigError(
+            f"`gh pr diff {pr_number}` failed (exit {exc.returncode})",
+            detail=exc.stderr.strip(),
+        ) from exc
+    return result.stdout
+
+
+def _glob_match(path: Path, patterns: tuple[str, ...] | list[str]) -> bool:
+    """Return True if ``path`` matches any of the glob ``patterns``.
+
+    Each pattern is tested against both the full POSIX path
+    (e.g. ``backend/api/views.py``) and the basename (e.g. ``views.py``)
+    so a pattern like ``test_*.py`` catches test files at any depth
+    rather than only at the repo root. fnmatch treats ``*`` as matching
+    everything including ``/``, so ``*.py`` matches all Python files
+    regardless of nesting; this is intentional and documented.
+
+    Verified: ``fnmatch.fnmatch("foo/bar.py", "*.py") == True``. Python
+    docs (fnmatch module): "Note that the filename separator (os.sep on
+    Unix) is not special to this module." This is the OPPOSITE of
+    ``glob`` semantics where ``*`` stops at ``/``; do not assume glob
+    behavior when reading or modifying this function.
+
+    The ``tuple[str, ...] | list[str]`` signature is intentionally
+    explicit rather than ``Sequence[str]`` -- a ``str`` is itself a
+    ``Sequence[str]`` (it iterates as single-character strings), so a
+    looser annotation would silently accept a caller bug like
+    ``_glob_match(p, "*.py")`` and iterate over individual characters
+    instead of treating the string as one pattern. The verbose union
+    blocks that footgun at type-check time.
+    """
+    posix = path.as_posix()
+    name = path.name
+    return any(
+        fnmatch.fnmatch(posix, pat) or fnmatch.fnmatch(name, pat)
+        for pat in patterns
+    )
+
+
+def gather_codebase_files(
+    includes: list[str], excludes: list[str]
+) -> list[Path]:
+    """Return the list of files to bundle for whole-codebase review.
+
+    Pipeline (in order):
+      1. ``git ls-files`` -> all tracked files (so ``.gitignore`` already
+         excludes ``node_modules``, ``.venv``, build artifacts, etc.).
+      2. Apply ``--include`` globs if any; otherwise keep all files.
+      3. Apply user-supplied ``--exclude`` globs.
+      4. Apply ``BUILTIN_CODEBASE_EXCLUDES`` (lock files, asset
+         extensions, etc.).
+      5. Drop files larger than ``MAX_INDIVIDUAL_FILE_BYTES``; log to
+         stderr so the user can ``--include`` them back if needed.
+
+    Returns paths relative to the current working directory (which is
+    expected to be the project being reviewed, since we run ``git
+    ls-files`` against CWD).
+    """
+    output = _run_git(["git", "ls-files"])
+    paths = [Path(line) for line in output.splitlines() if line]
+
+    # Step 2: user --include filter.
+    if includes:
+        paths = [p for p in paths if _glob_match(p, includes)]
+
+    # Step 3: user --exclude filter.
+    if excludes:
+        paths = [p for p in paths if not _glob_match(p, excludes)]
+
+    # Step 4: built-in defensive excludes.
+    paths = [p for p in paths if not _glob_match(p, BUILTIN_CODEBASE_EXCLUDES)]
+
+    # Step 5: per-file size cap.
+    kept: list[Path] = []
+    for p in paths:
+        try:
+            size = p.stat().st_size
+        except OSError:
+            # File listed by ``git ls-files`` but missing on disk
+            # (typo, symlink to nowhere, race). Skip silently rather
+            # than crash; the user can re-run if a file was meant to
+            # be present.
+            continue
+        if size > MAX_INDIVIDUAL_FILE_BYTES:
+            sys.stderr.write(
+                f"skip (>{_format_size(MAX_INDIVIDUAL_FILE_BYTES)}): "
+                f"{p.as_posix()} ({_format_size(size)})\n"
+            )
+            continue
+        kept.append(p)
+
+    return kept
+
+
+def _number_lines(content: str) -> str:
+    """Prefix each line with its 1-indexed line number, ``cat -n`` style.
+
+    LLMs cannot reliably count lines in long files. Without explicit
+    line numbers in the bundle, the model estimates line positions from
+    visual context and drifts by 50-150 lines on files >500 lines (and
+    5-15 lines on files >100). Prefixing every line with its number
+    turns "report a line number" from an arithmetic task (which
+    transformers cannot do reliably) into a transcription task (which
+    they do well).
+
+    Diff mode doesn't need this -- ``git diff -U5`` already embeds
+    ``@@ -L,N +L,N @@`` anchors plus context lines, so the model
+    transcribes from those. The whole-codebase bundle has no such
+    anchors, which is why this helper exists.
+
+    Format: ``<width>d: <line>``, right-aligned, minimum 6-char width
+    matching ``cat -n``. Trailing newline (if any) is preserved.
+    """
+    if not content:
+        return content
+    had_trailing_newline = content.endswith("\n")
+    lines = content.splitlines()
+    if not lines:
+        return content
+    width = max(6, len(str(len(lines))))
+    numbered = "\n".join(
+        f"{index:>{width}d}: {line}"
+        for index, line in enumerate(lines, start=1)
+    )
+    return numbered + "\n" if had_trailing_newline else numbered
+
+
+def bundle_codebase(file_paths: list[Path]) -> str:
+    """Concatenate the given files into a single delimited bundle.
+
+    Encoding errors are replaced silently (``errors="replace"``) so a
+    non-UTF8 byte sequence in a tracked file doesn't crash the runner;
+    the model sees a replacement character but the rest of the file is
+    still reviewable. In practice this only triggers on files that
+    should have been excluded by the asset-extension filter -- a true
+    source file with a stray non-UTF8 byte is rare.
+
+    Each file's content is line-numbered via ``_number_lines`` before
+    bundling so the model can reference accurate line numbers in its
+    findings without having to count lines itself. See that helper for
+    the rationale.
+    """
+    parts: list[str] = []
+    for path in file_paths:
+        content = path.read_text(encoding="utf-8", errors="replace")
+        delimiter = FILE_DELIMITER_TEMPLATE.format(path=path.as_posix())
+        parts.append(f"{delimiter}\n{_number_lines(content)}")
+    return "\n\n".join(parts)
+
+
+def _classify_http_error(
+    status: int,
+    body: str,
+    *,
+    model: str,
+    provider: str,
+) -> ReviewError:
+    """Map a 4xx/5xx response to the right typed error.
+
+    Provider-side messages are inconsistent across vendors, so we pattern-
+    match on a small set of substrings the major providers use today. The
+    classification is best-effort -- a future provider could land a 4xx
+    that doesn't match any pattern, and that's fine: it falls through to
+    a generic ``ReviewError`` which surfaces as exit 1 with the response
+    body in ``detail`` so a caller can decide.
+    """
+    body_lower = body.lower()
+    if status == 429:
+        return RateLimit(
+            f"{provider} returned HTTP 429",
+            detail=body[:1000],
+            model=model,
+            provider=provider,
+        )
+    # Match the family of provider phrases that signal context-length
+    # overflow, kept as a tuple so adding a new variant is a one-line
+    # edit. Each phrase was added based on an actual provider response
+    # observed in the wild or in published vendor docs; do not add
+    # speculative phrases without verifying a provider uses them, or
+    # you risk false-positive ContextOverflow classifications on
+    # unrelated 4xx errors.
+    CONTEXT_OVERFLOW_PHRASES = (
+        "context_length",
+        "too long",
+        "exceeds the maximum",
+        "token limit",
+        "input too large",
+        "payload size",
+    )
+    if status == 413 or any(phrase in body_lower for phrase in CONTEXT_OVERFLOW_PHRASES):
+        return ContextOverflow(
+            f"{provider} returned HTTP {status} with context-length indication",
+            detail=body[:1000],
+            model=model,
+            provider=provider,
+        )
+    if 500 <= status < 600:
+        return TransportError(
+            f"{provider} returned HTTP {status} (provider-side failure)",
+            detail=body[:1000],
+            model=model,
+            provider=provider,
+        )
+    return ReviewError(
+        f"{provider} returned HTTP {status}",
+        detail=body[:1000],
+        model=model,
+        provider=provider,
+    )
+
+
+def call_openrouter(
+    *,
+    system_prompt: str,
+    user_prompt: str,
+    model: str,
+    api_key: str,
+    referer: str,
+    title: str,
+    temperature: float,
+    max_tokens: int,
+) -> str:
+    """POST to OpenRouter's chat-completions endpoint and return the review
+    markdown. Caller builds the prompts so this function stays mode-agnostic
+    (diff review and codebase review share the same wire path).
+
+    Raises typed ``ReviewError`` subclasses on failure -- see the README's
+    "Error model" section for the contract. ``main`` catches and formats
+    them with the correct exit code.
+    """
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        # Temperature broadens exploration so more findings surface per
+        # call; the upstream prompt's "Critical Constraints" section
+        # still gates *quality*. ``max_tokens`` is a ceiling so a
+        # thorough review isn't truncated mid-finding -- the user pays
+        # only for tokens actually emitted, not the unused headroom.
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+    }
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+        # OpenRouter surfaces these in its dashboard for attribution; they
+        # also help the platform-side routing decide which provider tier to
+        # use. Harmless if absent but recommended.
+        "HTTP-Referer": referer,
+        "X-Title": title,
+    }
+
+    try:
+        with httpx.Client(timeout=HTTP_TIMEOUT) as client:
+            response = client.post(OPENROUTER_URL, headers=headers, json=payload)
+    except httpx.RequestError as exc:
+        # Network-level failure: DNS, TCP, timeout, connection reset.
+        # Distinct from a provider 5xx (which is also a transport-class
+        # error but at least returned a response).
+        raise TransportError(
+            f"OpenRouter request failed before response: {exc}",
+            model=model,
+            provider="openrouter",
+        ) from exc
+
+    if response.status_code >= 400:
+        raise _classify_http_error(
+            response.status_code, response.text, model=model, provider="openrouter"
+        )
+
+    try:
+        data = response.json()
+    except ValueError as exc:
+        raise ProviderHiccup(
+            f"OpenRouter returned non-JSON response: {exc}",
+            detail=response.text[:1000],
+            model=model,
+            provider="openrouter",
+        ) from exc
+
+    choices = data.get("choices") or []
+    if not choices:
+        raise ProviderHiccup(
+            "OpenRouter response had no choices",
+            detail=str(data)[:1000],
+            model=model,
+            provider="openrouter",
+        )
+    choice = choices[0]
+    # Inspect both ``finish_reason`` (OpenAI-standard normalized value)
+    # and ``native_finish_reason`` (OpenRouter's pass-through of the
+    # underlying provider's raw reason). They can disagree: a provider
+    # that safety-blocked a completion may surface as
+    # ``native_finish_reason="safety"`` while the normalized
+    # ``finish_reason="stop"`` -- preferring one over the other would
+    # miss that signal. We classify by the UNION of both fields, so a
+    # safety signal from either source wins over a generic "stop".
+    finish_reason = (choice.get("finish_reason") or "").lower()
+    native_finish_reason = (choice.get("native_finish_reason") or "").lower()
+    finish_reasons = {finish_reason, native_finish_reason} - {""}
+    message = choice.get("message") or {}
+    content = message.get("content")
+
+    if content:
+        return content
+
+    # Null / empty content -- classify by the union of finish_reasons.
+    if finish_reasons & {"safety", "content_filter", "content-filter", "blocked"}:
+        raise SafetyRefusal(
+            f"Model refused with finish_reasons={sorted(finish_reasons)!r}",
+            detail=str(data)[:1000],
+            model=model,
+            provider="openrouter",
+        )
+    if finish_reasons & {"length", "max_tokens"}:
+        raise ContextOverflow(
+            f"Hit max_tokens ({max_tokens}) before producing content "
+            f"(finish_reasons={sorted(finish_reasons)!r})",
+            detail=str(data)[:1000],
+            model=model,
+            provider="openrouter",
+        )
+    raise ProviderHiccup(
+        f"Null content with finish_reasons={sorted(finish_reasons)!r}",
+        detail=str(data)[:1000],
+        model=model,
+        provider="openrouter",
+    )
+
+
+def call_gemini(
+    *,
+    system_prompt: str,
+    user_prompt: str,
+    model: str,
+    api_key: str,
+    temperature: float,
+    max_tokens: int,
+) -> str:
+    """POST to Google AI Studio's ``generateContent`` endpoint directly.
+    Caller builds the prompts (same as ``call_openrouter``) so the wire
+    path is mode-agnostic. Raises typed ``ReviewError`` subclasses; see
+    README "Error model".
+    """
+    payload = {
+        "contents": [
+            {"role": "user", "parts": [{"text": user_prompt}]},
+        ],
+        "systemInstruction": {"parts": [{"text": system_prompt}]},
+        "generationConfig": {
+            # camelCase keys per the v1beta generateContent spec.
+            # ``maxOutputTokens`` is the ceiling on generated tokens;
+            # ``temperature`` matches the OpenRouter side so review
+            # behavior is consistent across providers.
+            "temperature": temperature,
+            "maxOutputTokens": max_tokens,
+        },
+    }
+    headers = {
+        "Content-Type": "application/json",
+        # ``x-goog-api-key`` is the documented auth header for the v1beta
+        # generative-language API. Passing the key as a query string also
+        # works but leaks it into shell history / proxy logs more readily.
+        "x-goog-api-key": api_key,
+    }
+    url = GEMINI_URL_TEMPLATE.format(model=model)
+
+    try:
+        with httpx.Client(timeout=HTTP_TIMEOUT) as client:
+            response = client.post(url, headers=headers, json=payload)
+    except httpx.RequestError as exc:
+        raise TransportError(
+            f"Gemini request failed before response: {exc}",
+            model=model,
+            provider="gemini",
+        ) from exc
+
+    if response.status_code >= 400:
+        raise _classify_http_error(
+            response.status_code, response.text, model=model, provider="gemini"
+        )
+
+    try:
+        data = response.json()
+    except ValueError as exc:
+        raise ProviderHiccup(
+            f"Gemini API returned non-JSON response: {exc}",
+            detail=response.text[:1000],
+            model=model,
+            provider="gemini",
+        ) from exc
+
+    # Gemini can refuse at the prompt level (before generation) by
+    # returning ``promptFeedback.blockReason`` with no candidates.
+    prompt_feedback = data.get("promptFeedback") or {}
+    block_reason = prompt_feedback.get("blockReason")
+    if block_reason:
+        raise SafetyRefusal(
+            f"Gemini blocked prompt: blockReason={block_reason!r}",
+            detail=str(data)[:1000],
+            model=model,
+            provider="gemini",
+        )
+
+    candidates = data.get("candidates") or []
+    if not candidates:
+        raise ProviderHiccup(
+            "Gemini response had no candidates",
+            detail=str(data)[:1000],
+            model=model,
+            provider="gemini",
+        )
+    candidate = candidates[0]
+    finish_reason = (candidate.get("finishReason") or "").upper()
+    content_block = candidate.get("content") or {}
+    parts = content_block.get("parts") or []
+    text = "".join(part.get("text", "") for part in parts)
+
+    if text:
+        return text
+
+    # Null / empty content -- classify by finishReason.
+    if finish_reason == "SAFETY":
+        raise SafetyRefusal(
+            "Gemini refused output with finishReason=SAFETY",
+            detail=str(data)[:1000],
+            model=model,
+            provider="gemini",
+        )
+    if finish_reason == "MAX_TOKENS":
+        raise ContextOverflow(
+            f"Hit maxOutputTokens ({max_tokens}) before producing content "
+            "(finishReason=MAX_TOKENS)",
+            detail=str(data)[:1000],
+            model=model,
+            provider="gemini",
+        )
+    raise ProviderHiccup(
+        f"Empty content with finishReason={finish_reason!r}",
+        detail=str(data)[:1000],
+        model=model,
+        provider="gemini",
+    )
+
+
+def call_ollama(
+    *,
+    system_prompt: str,
+    user_prompt: str,
+    model: str,
+    host: str,
+    temperature: float,
+    max_tokens: int,
+    timeout: float,
+) -> str:
+    """POST to a local Ollama server via its OpenAI-compatible chat-completions
+    endpoint. No API key required (local). Caller builds the prompts (same as
+    ``call_openrouter`` and ``call_gemini``) so the wire path is mode-agnostic.
+    Raises typed ``ReviewError`` subclasses; see README "Error model".
+
+    Differences from cloud providers:
+
+    - **No auth header.** Local server, no token-bearer logic.
+    - **First request after server start is slow** (~10-60s) because Ollama
+      lazy-loads the model into RAM. Subsequent requests within the
+      keep-alive window are fast. ``timeout`` is generously larger than
+      ``HTTP_TIMEOUT`` for cloud providers to absorb this cold start.
+    - **Connection refused** is a distinct failure mode (server not
+      running) versus a 4xx/5xx (server up, problem with the request).
+      We surface it as a ``ConfigError`` so the user gets actionable
+      guidance instead of a generic transport error.
+    - **404 means the model isn't pulled**, not "endpoint not found."
+      The body usually contains the model name; we surface it as a
+      ConfigError pointing at the right ``ollama pull`` command.
+    - **No content filter / safety mode.** Ollama doesn't refuse output
+      on safety grounds, so the empty-content classifier checks only
+      length / max-tokens / generic hiccup -- no SafetyRefusal branch.
+    """
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        # Temperature and max_tokens use the same keys as OpenRouter
+        # (Ollama's OpenAI-compat endpoint mirrors that shape).
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+        # Force non-streaming so we get the whole response in one JSON
+        # blob -- simpler to parse and matches how we handle the cloud
+        # providers.
+        "stream": False,
+    }
+    headers = {
+        "Content-Type": "application/json",
+    }
+    url = host.rstrip("/") + OLLAMA_CHAT_PATH
+
+    try:
+        with httpx.Client(timeout=timeout) as client:
+            response = client.post(url, headers=headers, json=payload)
+    except httpx.ConnectError as exc:
+        # Server unreachable. Most likely Ollama isn't running, or
+        # OLLAMA_HOST points at the wrong place. ConfigError so the
+        # caller (human or LLM agent) doesn't retry pointlessly -- they
+        # need to fix config first.
+        raise ConfigError(
+            f"Cannot reach Ollama at {host}. Is the server running? "
+            f"Start it with `ollama serve` (or `wsl -d Ubuntu -- ollama serve` "
+            f"if Ollama lives in WSL). Override the URL with --ollama-host "
+            f"or $OLLAMA_HOST if it's on a non-default port/machine.",
+            detail=str(exc),
+            model=model,
+            provider="ollama",
+        ) from exc
+    except httpx.RequestError as exc:
+        # Other network-layer failure (timeout reading response, DNS for
+        # a non-localhost host, etc.). TransportError so the auto-retry
+        # in ``_retry_on_recoverable`` gives it one more try before
+        # raising.
+        raise TransportError(
+            f"Ollama request to {host} failed: {exc}",
+            model=model,
+            provider="ollama",
+        ) from exc
+
+    # 404 from Ollama means the model isn't pulled. The response body
+    # typically includes the model name and a "try pulling first"
+    # message. Surface as a ConfigError with the exact pull command so
+    # the user can fix it in one step.
+    if response.status_code == 404:
+        raise ConfigError(
+            f"Model `{model}` not available on Ollama server at {host}. "
+            f"Run `ollama pull {model}` (or `wsl -d Ubuntu -- ollama pull "
+            f"{model}` if Ollama is in WSL), then retry.",
+            detail=response.text[:500],
+            model=model,
+            provider="ollama",
+        )
+
+    if response.status_code >= 400:
+        raise _classify_http_error(
+            response.status_code, response.text, model=model, provider="ollama"
+        )
+
+    try:
+        data = response.json()
+    except ValueError as exc:
+        raise ProviderHiccup(
+            f"Ollama returned non-JSON response: {exc}",
+            detail=response.text[:1000],
+            model=model,
+            provider="ollama",
+        ) from exc
+
+    choices = data.get("choices") or []
+    if not choices:
+        raise ProviderHiccup(
+            "Ollama response had no choices",
+            detail=str(data)[:1000],
+            model=model,
+            provider="ollama",
+        )
+    choice = choices[0]
+    finish_reason = (choice.get("finish_reason") or "").lower()
+    message = choice.get("message") or {}
+    content = message.get("content")
+
+    if content:
+        return content
+
+    # Null / empty content. Ollama doesn't have content-filter / safety
+    # refusals, so the only diagnosable empty-content cause is hitting
+    # the max-token ceiling. Anything else falls through to a generic
+    # ProviderHiccup which the caller can retry once.
+    if finish_reason in {"length", "max_tokens"}:
+        raise ContextOverflow(
+            f"Hit max_tokens ({max_tokens}) before producing content "
+            f"(finish_reason={finish_reason!r})",
+            detail=str(data)[:1000],
+            model=model,
+            provider="ollama",
+        )
+    raise ProviderHiccup(
+        f"Ollama returned empty content with finish_reason={finish_reason!r}",
+        detail=str(data)[:1000],
+        model=model,
+        provider="ollama",
+    )
+
+
+def _retry_on_recoverable(call: Callable[[], T], *, label: str) -> T:
+    """Run ``call``; on ``ProviderHiccup`` or ``TransportError`` retry once.
+
+    Single retry only -- we deliberately do NOT compound on safety refusals
+    or rate limits (a second call to the same model with the same prompt
+    almost always reproduces those, and burns tokens) or context overflows
+    (the scope is wrong, not the call). Caller can chain its own retries
+    on the surviving exception if it wants exponential backoff.
+
+    ``label`` shows up in the retry-notice stderr line so a viewer scrolling
+    the log can tell which call retried.
+    """
+    try:
+        return call()
+    except (ProviderHiccup, TransportError) as exc:
+        sys.stderr.write(
+            f"[retry] {exc.category} on first attempt ({label}); "
+            "retrying once in 2s...\n"
+        )
+        time.sleep(2)
+        return call()
+
+
+def _resolve_model(args: argparse.Namespace) -> str:
+    """Resolve the final model slug from CLI flag, env var, alias table,
+    or provider default.
+
+    Aliases are scoped per provider (see ``MODEL_ALIASES_BY_PROVIDER``):
+    OpenRouter aliases like ``claude`` resolve only with --provider
+    openrouter; Ollama aliases like ``local`` resolve only with
+    --provider ollama. Using an alias from the wrong table raises a
+    typed ``ConfigError`` pointing the caller at the correct
+    ``--provider`` instead of silently sending an invalid slug to the
+    upstream API.
+    """
+    if args.model is not None:
+        model = args.model
+    elif args.provider == "openrouter":
+        model = os.getenv("OPENROUTER_MODEL", DEFAULT_MODEL_BY_PROVIDER["openrouter"])
+    elif args.provider == "gemini":
+        model = os.getenv("GEMINI_MODEL", DEFAULT_MODEL_BY_PROVIDER["gemini"])
+    else:  # ollama
+        model = os.getenv("OLLAMA_MODEL", DEFAULT_MODEL_BY_PROVIDER["ollama"])
+
+    # Resolve via this provider's alias table.
+    provider_aliases = MODEL_ALIASES_BY_PROVIDER.get(args.provider, {})
+    if model in provider_aliases:
+        return provider_aliases[model]
+
+    # If the model name matches an alias for a DIFFERENT provider, the
+    # user almost certainly meant to switch providers. Surface that with
+    # a typed ConfigError naming the right --provider, so an LLM caller
+    # parsing stderr can self-correct instead of hammering the wrong
+    # endpoint.
+    for other_provider, other_aliases in MODEL_ALIASES_BY_PROVIDER.items():
+        if other_provider == args.provider:
+            continue
+        if model in other_aliases:
+            raise ConfigError(
+                f"Model alias `{model}` is only valid with "
+                f"--provider {other_provider} (currently --provider "
+                f"{args.provider}). Either switch with "
+                f"--provider {other_provider}, or pass an actual model "
+                f"name supported by --provider {args.provider}."
+            )
+
+    return model
+
+
+def _print_error(err: ReviewError) -> None:
+    """Emit a structured stderr block for ``err``.
+
+    First line is the stable machine-parseable prefix:
+    ``ERROR: <CATEGORY> [exit <N>]``. Subsequent lines are human-readable
+    detail. LLM callers can grep for the first line to classify; humans can
+    read the body.
+    """
+    sys.stderr.write(f"ERROR: {err.category} [exit {err.exit_code}]\n")
+    sys.stderr.write(f"Reason: {err}\n")
+    if err.model:
+        sys.stderr.write(f"Model: {err.model}\n")
+    if err.provider:
+        sys.stderr.write(f"Provider: {err.provider}\n")
+    sys.stderr.write(f"Suggested: {err.suggested}\n")
+    if err.detail:
+        # Truncate to keep the stderr block scannable; full body is on
+        # ``err.detail`` if a caller wants programmatic access.
+        snippet = err.detail if len(err.detail) <= 400 else err.detail[:400] + "..."
+        sys.stderr.write(f"Detail: {snippet}\n")
+
+
+def _ensure_utf8_stdout() -> None:
+    """Reconfigure stdout/stderr to UTF-8 if they aren't already.
+
+    The model's output regularly contains Unicode characters (``->`` rendered
+    as ``\\u2192``, em-dashes, smart quotes, mermaid arrows) that cp1252 -- the
+    default stdout encoding on Windows -- cannot encode. Without this, the
+    very last line of main(), ``print(output)``, crashes with
+    ``UnicodeEncodeError`` after the model call has already succeeded and the
+    user has already paid for the tokens. Forcing UTF-8 with
+    ``errors="replace"`` keeps the runner robust on Windows without changing
+    anything on macOS/Linux (which are already UTF-8 by default).
+    """
+    for stream in (sys.stdout, sys.stderr):
+        if hasattr(stream, "reconfigure"):
+            try:
+                if (stream.encoding or "").lower() != "utf-8":
+                    stream.reconfigure(encoding="utf-8", errors="replace")
+            except (AttributeError, ValueError):
+                # Best-effort. Some shells / CI pipes wrap stdout in a way
+                # that doesn't expose ``reconfigure``; in those cases we
+                # fall through and accept a possible UnicodeEncodeError
+                # rather than hide a real configuration problem.
+                pass
+
+
+def main() -> None:
+    _ensure_utf8_stdout()
+    # Load env from this script's directory so `.env` is configured once at
+    # the runner location rather than in every project we review from.
+    load_dotenv(ROOT / ".env")
+
+    parser = argparse.ArgumentParser(
+        description=(
+            "Standalone code-review runner using the Gemini CLI "
+            "code-review extension prompts. Sends them to a Gemini-or-"
+            "other model via OpenRouter or the Gemini API directly."
+        )
+    )
+    source = parser.add_mutually_exclusive_group()
+    source.add_argument(
+        "--base",
+        help="Base ref to diff against (e.g. main, origin/main).",
+    )
+    source.add_argument(
+        "--pr",
+        type=int,
+        help="GitHub PR number to review (uses `gh pr diff`).",
+    )
+    source.add_argument(
+        "--staged",
+        action="store_true",
+        help="Review staged changes only.",
+    )
+    source.add_argument(
+        "--codebase",
+        action="store_true",
+        help=(
+            "Review the whole tracked codebase via ``git ls-files`` "
+            "instead of a diff. Narrow with --include / --exclude. "
+            "Output shape is per-file findings (severity-tagged) -- "
+            "the architectural-summary shape is a v2 TODO documented "
+            "in the runbook's 'Future modes' section."
+        ),
+    )
+    parser.add_argument(
+        "--include",
+        action="append",
+        default=[],
+        metavar="GLOB",
+        help=(
+            "Glob to include in --codebase mode (e.g. "
+            "``backend/**/*.py``). Can be passed multiple times. "
+            "Ignored outside --codebase."
+        ),
+    )
+    parser.add_argument(
+        "--exclude",
+        action="append",
+        default=[],
+        metavar="GLOB",
+        help=(
+            "Glob to exclude in --codebase mode (e.g. "
+            "``**/test_*.py``). Can be passed multiple times. "
+            "Ignored outside --codebase."
+        ),
+    )
+    parser.add_argument(
+        "--provider",
+        choices=PROVIDERS,
+        default=os.getenv("CODE_REVIEW_PROVIDER", DEFAULT_PROVIDER),
+        help=(
+            "Which API to call. ``openrouter`` (default) goes through "
+            "OpenRouter's chat-completions endpoint and needs "
+            "``OPENROUTER_API_KEY``. ``gemini`` calls Google AI Studio's "
+            "generateContent endpoint directly and needs ``GEMINI_API_KEY``. "
+            "Override with $CODE_REVIEW_PROVIDER."
+        ),
+    )
+    parser.add_argument(
+        "--model",
+        default=None,
+        help=(
+            "Model slug or alias. Defaults to the provider-appropriate "
+            "value (``google/gemini-2.5-pro`` for openrouter, "
+            "``gemini-2.5-pro`` for gemini, ``qwen3-coder:30b`` for "
+            "ollama). Override with $OPENROUTER_MODEL / $GEMINI_MODEL / "
+            "$OLLAMA_MODEL respectively. Aliases: pro/flash/claude/"
+            "claude-opus/gpt/gpt-mini/deepseek (openrouter); "
+            "local/local-pro (ollama). ``gemini-2.5-flash`` / "
+            "``flash`` is ~3x faster with some quality loss."
+        ),
+    )
+    parser.add_argument(
+        "--ollama-host",
+        default=None,
+        metavar="URL",
+        help=(
+            "Ollama server URL when --provider ollama. Defaults to "
+            f"{DEFAULT_OLLAMA_HOST} or $OLLAMA_HOST. Useful if Ollama "
+            "is on a non-default port, on another machine, or running "
+            "inside WSL with non-default networking. Ignored for other "
+            "providers."
+        ),
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=None,
+        metavar="FLOAT",
+        help=(
+            f"Sampling temperature. Default {DEFAULT_TEMPERATURE} -- "
+            "tuned between the original 0.2 (too conservative, "
+            "missed real findings) and a brief 0.5 default (caught "
+            "more but produced hallucinated findings on cross-model "
+            "review). Range typically 0.0-1.0; higher widens "
+            "exploration at higher hallucination risk. Override with "
+            "$CODE_REVIEW_TEMPERATURE."
+        ),
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=None,
+        metavar="N",
+        dest="max_tokens",
+        help=(
+            f"Maximum output tokens the model may emit. Default "
+            f"{DEFAULT_MAX_TOKENS} -- raised from the implicit ~8K "
+            "provider default so a thorough review isn't truncated "
+            "mid-finding. This is a ceiling, not a target: you pay only "
+            "for tokens actually emitted. Override with "
+            "$CODE_REVIEW_MAX_TOKENS."
+        ),
+    )
+    parser.add_argument(
+        "--context",
+        default=None,
+        metavar="TEXT",
+        help=(
+            "Safety-context prefix prepended to every review prompt. "
+            "Reduces false-positive content-filter refusals on security "
+            "/ policy / adversarial-fixture code (the kind that contains "
+            "words like 'attack', 'sanctions', 'prompt injection' out of "
+            "context). Defaults to a generic 'authorized code review' "
+            "framing; override with this flag or $CODE_REVIEW_CONTEXT to "
+            "match your project's subject matter."
+        ),
+    )
+    parser.add_argument(
+        "--no-context",
+        action="store_true",
+        help=(
+            "Disable the safety-context prefix entirely. Useful only if "
+            "the default phrasing itself is what triggers a refusal "
+            "(rare). Mutually exclusive with --context."
+        ),
+    )
+    args = parser.parse_args()
+
+    if args.no_context and args.context is not None:
+        raise ConfigError(
+            "--no-context and --context are mutually exclusive. Pick one."
+        )
+
+    model = _resolve_model(args)
+
+    # The two blocks below deliberately duplicate the (CLI -> env ->
+    # default + validate) resolution pattern instead of extracting a
+    # generic helper. Two call sites is too few to justify a 6-param
+    # abstraction (cli_val, env_var_name, default, converter, type_name,
+    # flag_name), and keeping the env-var names inline makes them
+    # grep-discoverable ("where does CODE_REVIEW_TEMPERATURE get
+    # resolved?" returns a single hit). Revisit if a third tunable
+    # arrives.
+    # Resolve temperature: explicit CLI flag wins, then env, then default.
+    if args.temperature is not None:
+        temperature = args.temperature
+    else:
+        env_temp = os.getenv("CODE_REVIEW_TEMPERATURE")
+        try:
+            temperature = float(env_temp) if env_temp is not None else DEFAULT_TEMPERATURE
+        except ValueError as exc:
+            raise ConfigError(
+                f"$CODE_REVIEW_TEMPERATURE={env_temp!r} is not a valid float. "
+                "Unset it or pass --temperature explicitly."
+            ) from exc
+    # Validate range here rather than letting the provider 4xx -- catches
+    # the misconfig as a typed CONFIG error (exit 2) the LLM caller can
+    # react to, instead of an opaque provider UNKNOWN. ``2.0`` is the
+    # common ceiling across OpenAI / Anthropic / Gemini; providers that
+    # accept higher will simply not see it, which is fine.
+    if not 0.0 <= temperature <= 2.0:
+        raise ConfigError(
+            f"temperature={temperature} is out of range [0.0, 2.0]. "
+            "Set --temperature or $CODE_REVIEW_TEMPERATURE to a value "
+            "in that range."
+        )
+
+    # Resolve max_tokens with the same precedence.
+    if args.max_tokens is not None:
+        max_tokens = args.max_tokens
+    else:
+        env_max = os.getenv("CODE_REVIEW_MAX_TOKENS")
+        try:
+            max_tokens = int(env_max) if env_max is not None else DEFAULT_MAX_TOKENS
+        except ValueError as exc:
+            raise ConfigError(
+                f"$CODE_REVIEW_MAX_TOKENS={env_max!r} is not a valid integer. "
+                "Unset it or pass --max-tokens explicitly."
+            ) from exc
+    if max_tokens <= 0:
+        raise ConfigError(
+            f"max_tokens={max_tokens} must be positive. "
+            "Set --max-tokens or $CODE_REVIEW_MAX_TOKENS to a positive integer."
+        )
+
+    # Resolve safety context: --no-context wins, then explicit --context,
+    # then env, then default. Empty string from env is treated as "use
+    # default" rather than "disabled" -- pass --no-context explicitly to
+    # disable, since an env value of "" is more likely a misconfig than
+    # intent.
+    if args.no_context:
+        context: str | None = None
+    elif args.context is not None:
+        context = args.context
+    else:
+        context = os.getenv("CODE_REVIEW_CONTEXT") or DEFAULT_CONTEXT
+
+    # Resolve and validate provider-specific config before touching git /
+    # GitHub so the user fails fast on configuration errors. Cloud
+    # providers need an API key; Ollama is local so it needs a server
+    # URL and (optionally) a longer timeout instead.
+    api_key: str | None = None
+    ollama_host: str | None = None
+    ollama_timeout: float | None = None
+
+    if args.provider == "openrouter":
+        api_key = os.getenv("OPENROUTER_API_KEY")
+        if not api_key:
+            raise ConfigError(
+                "OPENROUTER_API_KEY not set. Copy .env.example to "
+                f".env at {ROOT} and fill in your key, or rerun with "
+                "--provider gemini (Google AI Studio key) or "
+                "--provider ollama (local server, no key needed)."
+            )
+    elif args.provider == "gemini":
+        api_key = os.getenv("GEMINI_API_KEY")
+        if not api_key:
+            raise ConfigError(
+                "GEMINI_API_KEY not set. Copy .env.example to .env "
+                f"at {ROOT} and fill in your Google AI Studio key, or "
+                "rerun with --provider openrouter (OpenRouter key) or "
+                "--provider ollama (local server, no key needed)."
+            )
+    else:  # ollama
+        # No API key for local provider. Resolve host + timeout from
+        # CLI / env / default. Validation of the host URL is implicit
+        # in the call -- a bad URL produces a typed ConnectError-derived
+        # ConfigError from ``call_ollama`` with a clear message.
+        ollama_host = (
+            args.ollama_host
+            or os.getenv("OLLAMA_HOST")
+            or DEFAULT_OLLAMA_HOST
+        )
+        env_timeout = os.getenv("OLLAMA_TIMEOUT")
+        try:
+            ollama_timeout = (
+                float(env_timeout) if env_timeout is not None
+                else DEFAULT_OLLAMA_TIMEOUT
+            )
+        except ValueError as exc:
+            raise ConfigError(
+                f"$OLLAMA_TIMEOUT={env_timeout!r} is not a valid float "
+                "(seconds). Unset it to use the default "
+                f"({DEFAULT_OLLAMA_TIMEOUT}s) or set it to a positive "
+                "number of seconds."
+            ) from exc
+        if ollama_timeout <= 0:
+            raise ConfigError(
+                f"$OLLAMA_TIMEOUT={ollama_timeout} must be positive "
+                "(seconds)."
+            )
+
+    # Build the payload for the chosen mode. ``--include`` / ``--exclude``
+    # are codebase-mode-only -- warn if they show up alongside a diff
+    # mode rather than silently dropping them.
+    if args.codebase:
+        files = gather_codebase_files(args.include, args.exclude)
+        if not files:
+            sys.stderr.write(
+                "No files matched after --include / --exclude / built-in "
+                "filters. Nothing to review.\n"
+            )
+            sys.exit(0)
+        bundle = bundle_codebase(files)
+        if len(bundle) > MAX_BUNDLE_CHARS:
+            # Show the 10 largest files so the user can target
+            # ``--exclude`` flags effectively rather than guessing.
+            # We re-stat in this branch rather than threading the
+            # sizes through ``gather_codebase_files``'s return type:
+            # this is a cold error path (only fires when the bundle
+            # exceeds the cap), so the redundant syscalls don't matter,
+            # and the alternative -- returning ``list[tuple[Path, int]]``
+            # from a function that 99% of callers only need ``list[Path]``
+            # from -- is a worse signature for a non-hot-path saving.
+            # Skip any file that disappeared between ``bundle_codebase``
+            # and now (narrow race window but possible on a busy CI box)
+            # so the error path doesn't itself crash with an
+            # ``OSError`` and bury the original ContextOverflow message.
+            def _safe_stat(p: Path) -> tuple[Path, int] | None:
+                try:
+                    return (p, p.stat().st_size)
+                except OSError:
+                    return None
+            sized_pairs = [pair for p in files if (pair := _safe_stat(p))]
+            sized = sorted(sized_pairs, key=lambda x: x[1], reverse=True)
+            largest = "\n".join(
+                f"  {_format_size(size):>10}  {path.as_posix()}"
+                for path, size in sized[:10]
+            )
+            raise ContextOverflow(
+                f"Codebase bundle is {len(bundle):,} chars "
+                f"(limit {MAX_BUNDLE_CHARS:,}). Narrow with --include "
+                "or --exclude.",
+                detail="Largest files in current selection:\n" + largest,
+                model=model,
+                provider=args.provider,
+            )
+        sys.stderr.write(
+            f"Reviewing {len(files)} file(s) ({len(bundle):,} chars) "
+            f"with `{model}` via {args.provider} "
+            f"(T={temperature}, max_tokens={max_tokens})...\n"
+        )
+        system_prompt, user_prompt = build_codebase_prompts(bundle, context)
+    else:
+        if args.include or args.exclude:
+            sys.stderr.write(
+                "WARN: --include / --exclude are ignored outside "
+                "--codebase mode.\n"
+            )
+        if args.pr:
+            diff = pr_diff(args.pr)
+        else:
+            diff = git_diff_local(args.base, args.staged)
+        if not diff.strip():
+            sys.stderr.write("No diff found. Nothing to review.\n")
+            sys.exit(0)
+        sys.stderr.write(
+            f"Reviewing {len(diff):,}-char diff with `{model}` via "
+            f"{args.provider} (T={temperature}, max_tokens={max_tokens})...\n"
+        )
+        system_prompt, user_prompt = build_diff_prompts(diff, context)
+
+    # Wire-format dispatch. All three providers take the same (system,
+    # user) prompt pair; only the request shape differs.
+    # ``_retry_on_recoverable`` wraps each call so a single provider
+    # hiccup or 5xx is absorbed automatically; other typed errors
+    # (safety, rate limit, context overflow, config) surface immediately
+    # for the caller to handle.
+    if args.provider == "openrouter":
+        # mypy: api_key is non-None here -- the config-check block above
+        # raises if OPENROUTER_API_KEY is missing.
+        assert api_key is not None
+        referer = os.getenv(
+            "OPENROUTER_HTTP_REFERER",
+            "https://github.com/Airwhale/local-gemini-code-review",
+        )
+        title = os.getenv("OPENROUTER_X_TITLE", "OpenRouter Code Review")
+        output = _retry_on_recoverable(
+            lambda: call_openrouter(
+                system_prompt=system_prompt,
+                user_prompt=user_prompt,
+                model=model,
+                api_key=api_key,
+                referer=referer,
+                title=title,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            ),
+            label="openrouter",
+        )
+    elif args.provider == "gemini":
+        assert api_key is not None
+        output = _retry_on_recoverable(
+            lambda: call_gemini(
+                system_prompt=system_prompt,
+                user_prompt=user_prompt,
+                model=model,
+                api_key=api_key,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            ),
+            label="gemini",
+        )
+    else:  # ollama
+        # ollama_host and ollama_timeout are non-None here -- the
+        # config-check block above sets defaults if they weren't
+        # provided. The assertions document that for readers and
+        # narrow the type for the lambda closure.
+        assert ollama_host is not None
+        assert ollama_timeout is not None
+        output = _retry_on_recoverable(
+            lambda: call_ollama(
+                system_prompt=system_prompt,
+                user_prompt=user_prompt,
+                model=model,
+                host=ollama_host,
+                temperature=temperature,
+                max_tokens=max_tokens,
+                timeout=ollama_timeout,
+            ),
+            label="ollama",
+        )
+    print(output)
+
+
+def _entrypoint() -> None:
+    """Top-level entry that maps typed errors to exit codes.
+
+    Keeping the try/except out of ``main`` itself means ``main`` can be
+    imported and unit-tested without the process-exit side effect.
+    """
+    try:
+        main()
+    except ReviewError as err:
+        _print_error(err)
+        sys.exit(err.exit_code)
+
+
+if __name__ == "__main__":
+    _entrypoint()
diff --git a/skills/code-review-codebase/SKILL.md b/skills/code-review-codebase/SKILL.md
new file mode 100644
index 0000000..70dbf82
--- /dev/null
+++ b/skills/code-review-codebase/SKILL.md
@@ -0,0 +1,63 @@
+---
+name: code-review-codebase
+description: Guidelines, persona, and critical constraints for performing whole-codebase code reviews on a bundle of files (not a diff). Use this skill when performing a /codebase-review command.
+user-invokable: false
+---
+
+# Code Review (Whole Codebase)
+
+## PERSONA
+
+You are a very experienced **Principal Software Engineer** and a meticulous **Code Review Architect**. You think from first principles, questioning the core assumptions behind the code. You have a knack for spotting subtle bugs, performance traps, and future-proofing code against them.
+
+## OBJECTIVE
+
+Your task is to deeply understand the **intent and structure** of a bundle of source files (the entire tracked codebase, or a filtered subset) and perform a **thorough, actionable, and objective** review.
+Your primary goal is to **identify potential bugs, security vulnerabilities, performance bottlenecks, and clarity issues** in the code as it stands today — without reference to a diff.
+Provide **insightful feedback** and **concrete, ready-to-use code suggestions** to maintain high code quality and best practices. Prioritize substantive feedback on logic, correctness, and maintainability over stylistic nits.
+
+## Bundle format
+
+The user prompt contains the codebase as a series of files concatenated together. Each file begins with a delimiter of the exact form:
+
+```
+======== FILE: <path/to/file> ========
+```
+
+The lines between two delimiters are the file's contents. **Each content line is prefixed with its 1-indexed line number followed by ``: ``** (e.g. ``   808: def _markdown_to_html(source: str) -> str:``). When you reference a line in your findings, use that prefix value verbatim as the line number — do not count lines yourself, just transcribe the prefix from the line you are commenting on. The line number is the position in that specific file's content (1-indexed at the line immediately following the delimiter), not a position in the overall bundle.
+
+## Instructions
+
+1. **Summarize the codebase**: Before looking for issues, articulate in one or two sentences what the codebase appears to do and how it is structured. Use this understanding to frame the review.
+2. **Establish context** by reading multiple files together. Cross-file relationships (an importer and its importee, a producer and its consumer, a type and its usage) are often where the real bugs are.
+3. **Prioritize Analysis Focus**: Concentrate your deepest analysis on application code (non-test files). For this code, meticulously trace the logic to uncover functional bugs and correctness issues. Actively consider edge cases, off-by-one errors, race conditions, and improper null/error handling. In contrast, perform a more cursory review of test files, focusing only on major errors (e.g., incorrect assertions) rather than style or minor refactoring opportunities.
+4. **Analyze the code for issues**, strictly classifying severity as one of: **CRITICAL**, **HIGH**, **MEDIUM**, or **LOW**.
+
+## Critical Constraints
+
+**STRICTLY follow these rules for review comments:**
+
+* **Location:** You **MAY** comment on any line in any file in the provided bundle. Reference each comment by the file path (as shown in the `======== FILE: <path> ========` delimiter) and the 1-indexed line number within that file. Pay close attention to file boundaries — a comment must reference the file in which the code actually appears, not an adjacent file in the bundle.
+* **Relevance:** You **MUST** only add a review comment if there is a demonstrable **BUG**, **ISSUE**, or a significant **OPPORTUNITY FOR IMPROVEMENT**.
+* **Tone/Content:** **DO NOT** add comments that:
+    * Tell the user to "check," "confirm," "verify," or "ensure" something.
+    * Explain what the code does or validate its purpose.
+    * Explain the code to the author (they are assumed to know their own code).
+    * Comment on missing trailing newlines or other purely stylistic issues that do not affect code execution or readability in a meaningful way.
+* **Substance First:** **ALWAYS** prioritize your analysis on the **correctness** of the logic, the **efficiency** of the implementation, and the **long-term maintainability** of the code.
+* **Technical Detail:**
+    * Pay **meticulous attention to line numbers and indentation** in code suggestions; they **must** be correct and match the surrounding code in the referenced file.
+    * **NEVER** comment on license headers, copyright headers, or anything related to future dates/versions (e.g., "this date is in the future").
+* **Formatting/Structure:**
+    * Keep the **codebase summary** concise (aim for a single sentence).
+    * Keep **comment bodies concise** and focused on a single issue.
+    * If a similar issue exists in **multiple locations** (whether within one file or across several), state it once and list the other locations instead of repeating the full comment.
+    * **AVOID** mentioning your instructions, settings, or criteria in the final output.
+
+**Severity Guidelines (for consistent classification):**
+
+* **Functional correctness bugs that lead to behavior contrary to the code's apparent intent should generally be classified as HIGH or CRITICAL.**
+* **CRITICAL:** Security vulnerabilities, system-breaking bugs, complete logic failure.
+* **HIGH:** Performance bottlenecks (e.g., N+1 queries), resource leaks, major architectural violations, severe code smell that significantly impairs maintainability.
+* **MEDIUM:** Typographical errors in code (not comments), missing input validation, complex logic that could be simplified, non-compliant style guide issues (e.g., wrong naming convention).
+* **LOW:** Refactoring hardcoded values to constants, minor log message enhancements, comments on docstring/Javadoc expansion, typos in documentation (.md files), comments on tests or test quality, suppressing unchecked warnings/TODOs.
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000..1113e65
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,104 @@
+version = 1
+revision = 3
+requires-python = ">=3.11"
+
+[[package]]
+name = "anyio"
+version = "4.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "idna" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" },
+]
+
+[[package]]
+name = "certifi"
+version = "2026.4.22"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/25/ee/6caf7a40c36a1220410afe15a1cc64993a1f864871f698c0f93acb72842a/certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580", size = 137077, upload-time = "2026-04-22T11:26:11.191Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707, upload-time = "2026-04-22T11:26:09.372Z" },
+]
+
+[[package]]
+name = "code-review-openrouter"
+version = "0.1.0"
+source = { virtual = "." }
+dependencies = [
+    { name = "httpx" },
+    { name = "python-dotenv" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "httpx", specifier = ">=0.27" },
+    { name = "python-dotenv", specifier = ">=1.0" },
+]
+
+[[package]]
+name = "h11"
+version = "0.16.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "h11" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
+]
+
+[[package]]
+name = "httpx"
+version = "0.28.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "certifi" },
+    { name = "httpcore" },
+    { name = "idna" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
+]
+
+[[package]]
+name = "idna"
+version = "3.15"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245, upload-time = "2026-05-12T22:45:57.011Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" },
+]
+
+[[package]]
+name = "python-dotenv"
+version = "1.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
+]