diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 804499b..cfada42 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -57,9 +57,6 @@ jobs:
       - name: Smoke test production CLI and MCP
         run: pnpm smoke
 
-      - name: Verify generated dist is committed
-        run: git diff --exit-code -- packages/ragmir-core/dist packages/ragmir-tts/dist
-
       - name: Verify npm package metadata
         run: pnpm package:check
 
diff --git a/.gitignore b/.gitignore
index e527bf9..cd3c341 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,10 @@ private/**
 *.tgz
 release-artifacts/
 *.pid
+*dist
 
+packages/ragmir-core/dist/
+packages/ragmir-tts/dist/
 packages/ragmir-app/dist/
 packages/ragmir-app/src-tauri/target/
 packages/ragmir-app/src-tauri/gen/
diff --git a/AGENTS.md b/AGENTS.md
index 6a14b93..fa8aea6 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -187,9 +187,10 @@
 - `packages/ragmir-core/examples/library-api-demo` is the local library-API smoke (`pnpm example`). It
   `import`s `@jcode.labs/ragmir` via Node self-referencing so it always exercises the local
   `packages/ragmir-core/dist` build, never the npm-published package, and it reuses the
-  `sovereign-rag-demo` synthetic corpus rather than adding a second one. Testing local changes must
-  use this local build (or `node packages/ragmir-core/dist/cli.js`), not `npx ragmir`, which would
-  resolve the released npm version.
+  `sovereign-rag-demo` synthetic corpus rather than adding a second one. `dist/` is a gitignored build
+  output: build it first with `pnpm build` (or run `pnpm example`, which builds first), then run
+  `node packages/ragmir-core/dist/cli.js`. Never use `npx ragmir`, which would resolve the released npm
+  version.
 - Use Context7 before changing dependencies or public APIs that rely on external libraries.
 - Run `pnpm validate` before opening a release pull request or publishing. It covers
   Biome, dependency security audit, TypeScript, Vitest, build output, production CLI/MCP smoke
@@ -234,7 +235,8 @@ General principles (KISS, DRY, YAGNI, SOLID) as applied in this codebase. Match
   `openRowsTable`, `redactText`, `supportedExtensions`, `recordAccess`); extract instead of copying.
   `embedText` delegating to `embedTexts` is the reference pattern.
 - No dead or obsolete code. Delete replaced code, unused exports, and commented-out blocks in the
-  same change; a deletion must cover both source and the regenerated package `dist/`.
+  same change. `dist/` is gitignored build output: regenerate it locally with `pnpm build` before
+  running CLI/MCP smoke or the library-API demo, but do not commit it.
 - No magic strings or numbers. Name meaningful literals as constants, and put shared paths, provider
   defaults, and ignore constants in `packages/ragmir-core/src/defaults.ts` rather than copying them across
   modules.
diff --git a/CLAUDE.md b/CLAUDE.md
index 9d09b69..4faf194 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -40,13 +40,15 @@ Run only the TTS package tests: `pnpm --filter @jcode.labs/ragmir-tts test`
 
 Tests are colocated as `packages/*/src/*.test.ts` and run on the TypeScript sources.
 
-## Committed `dist/` — critical
-
-`packages/ragmir-core/dist/` and `packages/ragmir-tts/dist/` are checked into Git. CI enforces
-`git diff --exit-code -- packages/ragmir-core/dist packages/ragmir-tts/dist`. After any change under
-`packages/ragmir-core/src/` or `packages/ragmir-tts/src/`, run `pnpm build` and commit the regenerated
-output in the same commit, or CI fails. This is the single easiest mistake to make in this repo.
-`packages/ragmir-app/dist/` and `packages/ragmir-landing/dist/` are build artifacts and stay ignored.
+## `dist/` is gitignored build output — critical
+
+All `packages/*/dist/` directories (`ragmir-core`, `ragmir-tts`, `ragmir-app`, `ragmir-landing`,
+`ragmir-license-webhook`) are gitignored build output and are NOT checked into Git. Build them locally
+with `pnpm build` before running the CLI, MCP smoke, the library-API demo, or `pnpm validate`. CI
+rebuilds `dist/` from source in the `Build` step before smoke tests, and the release pipeline rebuilds
+it again (`scripts/semantic-release-prepare.mjs` runs `pnpm --filter @jcode.labs/ragmir build`) before
+`pnpm pack`/`publish`, so the published npm tarball always contains freshly built output. Never commit
+`dist/`; a clean clone has none until `pnpm build` runs.
 
 ## Naming map (the package has several names on purpose)
 
diff --git a/README.md b/README.md
index 9ed3335..06c09d7 100644
--- a/README.md
+++ b/README.md
@@ -994,8 +994,9 @@ retrieval mode, so they run without downloading an embedding or chat model, and
 documents.
 
 > Testing local changes: use the repository's own build, not `npx`. Inside this repo `npx ragmir`
-> resolves to the **published** npm package, not your working copy — so it would not exercise your
-> local edits. The examples below run the local `dist/` build instead.
+> resolves to the **published** npm package, not your working copy—so it would not exercise your
+> local edits. Build once with `pnpm build`, then run the examples against the local `dist/` build.
+> (`dist/` is gitignored build output, so a clean clone has none until `pnpm build` runs.)
 
 ### CLI workspace (`sovereign-rag-demo`)
 
@@ -1048,16 +1049,18 @@ pnpm --filter @jcode.labs/ragmir build
 pnpm --filter @jcode.labs/ragmir-tts build
 ```
 
-`packages/ragmir-core/dist/` and `packages/ragmir-tts/dist/` are committed. `packages/ragmir-app/dist/`
-and `packages/ragmir-landing/dist/` are ignored build artifacts. After changing TypeScript sources in
-published packages, run:
+All `packages/*/dist/` directories (`ragmir-core`, `ragmir-tts`, `ragmir-app`, `ragmir-landing`) are
+gitignored build output — they are not checked into Git. After changing TypeScript sources in published
+packages, build and validate locally:
 
 ```bash
 pnpm build
 pnpm validate
 ```
 
-CI checks that generated `dist/` files match the source.
+CI rebuilds `dist/` from source before smoke tests, and the release pipeline rebuilds it again before
+`pnpm pack`/`publish`, so the published tarball always carries fresh output. A clean clone has no
+`dist/` until `pnpm build` runs.
 
 The root package is private and only orchestrates workspace tasks. npm publishing is handled by the
 protected `Release npm` GitHub Actions workflow on `main`. semantic-release derives the version from
diff --git a/packages/ragmir-core/examples/library-api-demo/README.md b/packages/ragmir-core/examples/library-api-demo/README.md
index fa9ae2d..60855a3 100644
--- a/packages/ragmir-core/examples/library-api-demo/README.md
+++ b/packages/ragmir-core/examples/library-api-demo/README.md
@@ -25,7 +25,8 @@ That builds Ragmir Core, then runs the demo. It reuses the committed synthetic c
 `sovereign-rag-demo`, so it needs no private documents and writes only to that example's gitignored
 `.ragmir/storage`.
 
-To run it directly against the already-built `dist/` without rebuilding:
+To run it directly against the already-built `dist/` without rebuilding (run `pnpm build` first;
+`dist/` is gitignored, so a clean clone has none):
 
 ```bash
 node packages/ragmir-core/examples/library-api-demo/run.mjs
diff --git a/packages/ragmir-core/src/access-log.test.ts b/packages/ragmir-core/src/access-log.test.ts
index 50b914f..25884e7 100644
--- a/packages/ragmir-core/src/access-log.test.ts
+++ b/packages/ragmir-core/src/access-log.test.ts
@@ -1,4 +1,4 @@
-import { appendFile, mkdtemp, rm } from "node:fs/promises"
+import { appendFile, mkdtemp, rm, stat, writeFile } from "node:fs/promises"
 import os from "node:os"
 import path from "node:path"
 import { afterEach, describe, expect, it } from "vitest"
@@ -59,3 +59,41 @@ describe("accessLogUsageReport", () => {
     )
   })
 })
+
+describe("recordAccess retention", () => {
+  it("trims the access log when it exceeds the size cap, keeping the most recent lines", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-access-log-trim-"))
+    tempDirs.push(root)
+    await initProject(root)
+    const config = await loadConfig(root)
+
+    // Pre-grow the log past the 10 MB cap with filler lines, then append one
+    // real event. The retention trim must shrink the file well below the cap
+    // while keeping the newest event intact.
+    const fillerLine = '{"action":"ingest","timestamp":"2024-01-01T00:00:00.000Z"}'
+    const fillerCount = Math.ceil((11 * 1024 * 1024) / (fillerLine.length + 1))
+    const filler = `${Array(fillerCount).fill(fillerLine).join("\n")}\n`
+    await writeFile(config.accessLogPath, filler, "utf8")
+    const sizeBefore = (await stat(config.accessLogPath)).size
+    expect(sizeBefore).toBeGreaterThan(10 * 1024 * 1024)
+
+    await recordAccess(config, { action: "search", resultCount: 1 })
+
+    const sizeAfter = (await stat(config.accessLogPath)).size
+    expect(sizeAfter).toBeLessThan(sizeBefore)
+    expect(sizeAfter).toBeLessThan(10 * 1024 * 1024)
+  })
+
+  it("does not write a log entry when access logging is disabled", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-access-log-disabled-"))
+    tempDirs.push(root)
+    await initProject(root)
+    const config = await loadConfig(root)
+    const disabledConfig = { ...config, accessLog: false }
+
+    await recordAccess(disabledConfig, { action: "search", resultCount: 1 })
+
+    // No log file should exist because the disabled path returns before any append.
+    await expect(stat(config.accessLogPath)).rejects.toThrow("ENOENT")
+  })
+})
diff --git a/packages/ragmir-core/src/access-log.ts b/packages/ragmir-core/src/access-log.ts
index 4421017..43fe08a 100644
--- a/packages/ragmir-core/src/access-log.ts
+++ b/packages/ragmir-core/src/access-log.ts
@@ -1,6 +1,6 @@
 import { createHash } from "node:crypto"
 import { existsSync } from "node:fs"
-import { appendFile, mkdir, readFile } from "node:fs/promises"
+import { appendFile, mkdir, readFile, stat, writeFile } from "node:fs/promises"
 import path from "node:path"
 import { loadConfig } from "./config.js"
 import type {
@@ -36,6 +36,14 @@ const ACCESS_LOG_ACTIONS: AccessLogAction[] = [
 const ACCESS_LOG_ACTION_SET = new Set<string>(ACCESS_LOG_ACTIONS)
 const DEFAULT_USAGE_REPORT_DAYS = 7
 const MILLISECONDS_PER_DAY = 24 * 60 * 60 * 1000
+/**
+ * Soft cap above which the access log is trimmed to its most recent lines.
+ * Keeps the file bounded so long-lived installations (MCP server) do not grow
+ * it without limit, and so usage reports do not load an unbounded file.
+ */
+const MAX_ACCESS_LOG_BYTES = 10 * 1024 * 1024
+/** Number of most recent lines retained when the log exceeds the byte cap. */
+const TRIMMED_ACCESS_LOG_LINES = 50_000
 
 export async function recordAccess(config: Config, event: AccessLogEvent): Promise<void> {
   if (!config.accessLog) {
@@ -44,12 +52,38 @@ export async function recordAccess(config: Config, event: AccessLogEvent): Promi
 
   try {
     await mkdir(path.dirname(config.accessLogPath), { recursive: true })
+    await trimAccessLogIfNeeded(config.accessLogPath)
     await appendFile(config.accessLogPath, `${JSON.stringify(toLogLine(event))}\n`, "utf8")
   } catch {
     // Access logging is best-effort so read-only workspaces do not block local use.
   }
 }
 
+async function trimAccessLogIfNeeded(accessLogPath: string): Promise<void> {
+  let size = 0
+  try {
+    size = (await stat(accessLogPath)).size
+  } catch (error) {
+    if (isNodeError(error) && error.code === "ENOENT") {
+      return
+    }
+    throw error
+  }
+
+  if (size <= MAX_ACCESS_LOG_BYTES) {
+    return
+  }
+
+  const content = await readFile(accessLogPath, "utf8")
+  const lines = content.split("\n").filter((line) => line.length > 0)
+  const kept = lines.slice(Math.max(0, lines.length - TRIMMED_ACCESS_LOG_LINES))
+  await writeFile(accessLogPath, `${kept.join("\n")}\n`, "utf8")
+}
+
+function isNodeError(error: unknown): error is NodeJS.ErrnoException {
+  return error instanceof Error && "code" in error
+}
+
 export async function accessLogUsageReport(
   options: AccessLogUsageOptions = {},
 ): Promise<AccessLogUsageReport> {
diff --git a/packages/ragmir-core/src/cli-options.ts b/packages/ragmir-core/src/cli-options.ts
new file mode 100644
index 0000000..5baeb24
--- /dev/null
+++ b/packages/ragmir-core/src/cli-options.ts
@@ -0,0 +1,121 @@
+import { isTtsLanguage, TTS_LANGUAGES, type TtsLanguage } from "@jcode.labs/ragmir-tts"
+import type { AgentInstallMode, AgentInstallScope } from "./skill.js"
+
+/**
+ * Pure option-parsing and validation helpers for the Ragmir CLI. Kept separate
+ * from `cli.ts` (which wires Commander and side effects) so they can be unit
+ * tested without spawning a process or importing commander.
+ *
+ * Each helper either returns the validated value or throws an Error with a
+ * user-facing message; the CLI surfaces the message in red on stderr.
+ */
+
+export type AudioEngine = "auto" | "edge" | "transformers"
+
+export interface AudioOptions {
+  out?: string
+  engine?: string
+  lang?: string
+  offline?: boolean
+  allowRemoteModels?: boolean
+}
+
+/** Parse and validate a positive integer CLI argument. */
+export function parsePositiveInt(value: string): number {
+  const parsed = Number(value)
+  // Use Number() (not parseInt) so fractional input like "1.5" is rejected
+  // instead of silently truncating to 1, and so non-numeric strings become NaN.
+  if (!Number.isInteger(parsed) || parsed <= 0) {
+    throw new Error("Expected a positive integer.")
+  }
+  return parsed
+}
+
+/** Parse and validate a finite number CLI argument. */
+export function parseNumber(value: string): number {
+  const parsed = Number.parseFloat(value)
+  if (!Number.isFinite(parsed)) {
+    throw new Error("Expected a number.")
+  }
+  return parsed
+}
+
+/** Parse and validate a recall threshold CLI argument in the inclusive range 0..1. */
+export function parseRecallThreshold(value: string): number {
+  const trimmed = value.trim()
+  const parsed = Number(trimmed)
+  if (trimmed.length === 0 || !Number.isFinite(parsed) || parsed < 0 || parsed > 1) {
+    throw new Error("Expected a recall threshold between 0 and 1.")
+  }
+  return parsed
+}
+
+/**
+ * Resolve the `allowRemoteModels` flag from audio options. Offline mode forces
+ * remote model loading off; an explicit opt-in enables it; otherwise undefined
+ * lets the TTS package apply its own offline-by-default behaviour.
+ */
+export function audioAllowRemoteModels(options: AudioOptions): boolean | undefined {
+  if (options.offline) {
+    return false
+  }
+  if (options.allowRemoteModels) {
+    return true
+  }
+  return undefined
+}
+
+/**
+ * Resolve the spoken language from `--lang`. Throws on an unsupported value so
+ * the operator is told which languages are available.
+ */
+export function audioLanguage(options: AudioOptions): TtsLanguage | undefined {
+  if (options.lang === undefined) {
+    return undefined
+  }
+  if (isTtsLanguage(options.lang)) {
+    return options.lang
+  }
+  throw new Error(`Expected --lang to be one of: ${TTS_LANGUAGES.join(", ")}.`)
+}
+
+/**
+ * Resolve the TTS engine from audio options. Offline always forces the local
+ * Transformers.js renderer. An MP3 output without an explicit `--engine edge`
+ * is rejected as a confidentiality guard: MP3 requires the online Edge TTS
+ * service, so the operator must opt in knowingly rather than leak narration
+ * text by accident.
+ */
+export function audioEngine(options: AudioOptions): AudioEngine {
+  if (options.offline) {
+    return "transformers"
+  }
+  if (options.engine === undefined) {
+    if (options.out?.toLowerCase().endsWith(".mp3")) {
+      throw new Error(
+        "MP3 output uses online Edge TTS. Re-run with `--engine edge` only when sending narration text to Edge TTS is acceptable.",
+      )
+    }
+    return "transformers"
+  }
+  if (options.engine === "auto" || options.engine === "edge" || options.engine === "transformers") {
+    return options.engine
+  }
+  throw new Error("Expected --engine to be auto, edge, or transformers.")
+}
+
+/** Parse and validate the `--scope` agent-install argument. */
+export function parseAgentInstallScope(value: string | undefined): AgentInstallScope {
+  if (value === "project" || value === "user") {
+    return value
+  }
+  throw new Error("Expected --scope to be project or user.")
+}
+
+/** Parse and validate the `--mode` agent-install argument. */
+export function parseAgentInstallMode(value: string | undefined): AgentInstallMode {
+  if (value === "link" || value === "copy") {
+    return value
+  }
+  throw new Error("Expected --mode to be link or copy.")
+}
diff --git a/packages/ragmir-core/src/cli.test.ts b/packages/ragmir-core/src/cli.test.ts
new file mode 100644
index 0000000..30e5f61
--- /dev/null
+++ b/packages/ragmir-core/src/cli.test.ts
@@ -0,0 +1,147 @@
+import { describe, expect, it } from "vitest"
+import {
+  audioAllowRemoteModels,
+  audioEngine,
+  audioLanguage,
+  parseAgentInstallMode,
+  parseAgentInstallScope,
+  parseNumber,
+  parsePositiveInt,
+  parseRecallThreshold,
+} from "./cli-options.js"
+
+describe("parsePositiveInt", () => {
+  it("parses a positive integer", () => {
+    expect(parsePositiveInt("1")).toBe(1)
+    expect(parsePositiveInt("42")).toBe(42)
+  })
+
+  it("rejects zero, negatives, and non-integers", () => {
+    expect(() => parsePositiveInt("0")).toThrow("positive integer")
+    expect(() => parsePositiveInt("-3")).toThrow("positive integer")
+    expect(() => parsePositiveInt("1.5")).toThrow("positive integer")
+    expect(() => parsePositiveInt("abc")).toThrow("positive integer")
+    expect(() => parsePositiveInt("")).toThrow("positive integer")
+  })
+})
+
+describe("parseNumber", () => {
+  it("parses finite numbers including negatives and decimals", () => {
+    expect(parseNumber("1")).toBe(1)
+    expect(parseNumber("-2.5")).toBe(-2.5)
+    expect(parseNumber("0")).toBe(0)
+  })
+
+  it("rejects non-finite values", () => {
+    expect(() => parseNumber("abc")).toThrow("Expected a number")
+    expect(() => parseNumber("")).toThrow("Expected a number")
+  })
+})
+
+describe("parseRecallThreshold", () => {
+  it("accepts values in the inclusive 0..1 range", () => {
+    expect(parseRecallThreshold("0")).toBe(0)
+    expect(parseRecallThreshold("1")).toBe(1)
+    expect(parseRecallThreshold("0.75")).toBe(0.75)
+  })
+
+  it("trims surrounding whitespace before parsing", () => {
+    expect(parseRecallThreshold("  0.5  ")).toBe(0.5)
+  })
+
+  it("rejects values outside the range and non-numbers", () => {
+    expect(() => parseRecallThreshold("-0.1")).toThrow("between 0 and 1")
+    expect(() => parseRecallThreshold("1.1")).toThrow("between 0 and 1")
+    expect(() => parseRecallThreshold("abc")).toThrow("between 0 and 1")
+    expect(() => parseRecallThreshold("   ")).toThrow("between 0 and 1")
+  })
+})
+
+describe("audioAllowRemoteModels", () => {
+  it("forces remote models off when offline", () => {
+    expect(audioAllowRemoteModels({ offline: true, allowRemoteModels: true })).toBe(false)
+  })
+
+  it("enables remote models on explicit opt-in", () => {
+    expect(audioAllowRemoteModels({ allowRemoteModels: true })).toBe(true)
+  })
+
+  it("returns undefined by default to defer to the TTS package default", () => {
+    expect(audioAllowRemoteModels({})).toBeUndefined()
+  })
+})
+
+describe("audioLanguage", () => {
+  it("returns undefined when no language is provided", () => {
+    expect(audioLanguage({})).toBeUndefined()
+  })
+
+  it("accepts a supported language", () => {
+    expect(audioLanguage({ lang: "fr" })).toBe("fr")
+    expect(audioLanguage({ lang: "en" })).toBe("en")
+    expect(audioLanguage({ lang: "es" })).toBe("es")
+  })
+
+  it("rejects an unsupported language with the list of valid options", () => {
+    expect(() => audioLanguage({ lang: "de" })).toThrow("en, es, fr")
+  })
+})
+
+describe("audioEngine", () => {
+  it("forces the transformers engine when offline", () => {
+    expect(audioEngine({ offline: true })).toBe("transformers")
+    expect(audioEngine({ offline: true, engine: "edge" })).toBe("transformers")
+  })
+
+  it("defaults to transformers when no engine is given", () => {
+    expect(audioEngine({})).toBe("transformers")
+  })
+
+  it("accepts explicit engine choices", () => {
+    expect(audioEngine({ engine: "edge" })).toBe("edge")
+    expect(audioEngine({ engine: "auto" })).toBe("auto")
+    expect(audioEngine({ engine: "transformers" })).toBe("transformers")
+  })
+
+  it("guards MP3 output without an explicit engine (confidentiality check)", () => {
+    expect(() => audioEngine({ out: "summary.mp3" })).toThrow("MP3 output uses online Edge TTS")
+  })
+
+  it("allows MP3 output when edge engine is explicit", () => {
+    expect(audioEngine({ out: "summary.mp3", engine: "edge" })).toBe("edge")
+  })
+
+  it("rejects an invalid engine value", () => {
+    expect(() => audioEngine({ engine: "piper" })).toThrow("auto, edge, or transformers")
+  })
+})
+
+describe("parseAgentInstallScope", () => {
+  it("accepts project and user scopes", () => {
+    expect(parseAgentInstallScope("project")).toBe("project")
+    expect(parseAgentInstallScope("user")).toBe("user")
+  })
+
+  it("returns project by default when no value is given", () => {
+    // Note: the CLI passes undefined when --scope is omitted; the default is
+    // applied at the Commander level, but the parser itself accepts undefined
+    // only as an error here (the default wiring in cli.ts guarantees a value).
+    expect(() => parseAgentInstallScope(undefined)).toThrow("project or user")
+  })
+
+  it("rejects unknown scopes", () => {
+    expect(() => parseAgentInstallScope("global")).toThrow("project or user")
+  })
+})
+
+describe("parseAgentInstallMode", () => {
+  it("accepts link and copy modes", () => {
+    expect(parseAgentInstallMode("link")).toBe("link")
+    expect(parseAgentInstallMode("copy")).toBe("copy")
+  })
+
+  it("rejects unknown modes", () => {
+    expect(() => parseAgentInstallMode("move")).toThrow("link or copy")
+    expect(() => parseAgentInstallMode(undefined)).toThrow("link or copy")
+  })
+})
diff --git a/packages/ragmir-core/src/cli.ts b/packages/ragmir-core/src/cli.ts
index 5eb95ad..e215725 100644
--- a/packages/ragmir-core/src/cli.ts
+++ b/packages/ragmir-core/src/cli.ts
@@ -1,9 +1,19 @@
 #!/usr/bin/env node
 import path from "node:path"
-import { isTtsLanguage, TTS_LANGUAGES, type TtsLanguage } from "@jcode.labs/ragmir-tts"
+import type { TtsLanguage } from "@jcode.labs/ragmir-tts"
 import { Command } from "commander"
 import pc from "picocolors"
 import { accessLogUsageReport } from "./access-log.js"
+import {
+  audioAllowRemoteModels,
+  audioEngine,
+  audioLanguage,
+  parseAgentInstallMode,
+  parseAgentInstallScope,
+  parseNumber,
+  parsePositiveInt,
+  parseRecallThreshold,
+} from "./cli-options.js"
 import { loadConfig } from "./config.js"
 import { DEFAULT_SKILL_TARGET_DIR } from "./defaults.js"
 import { destroyIndex } from "./destroy.js"
@@ -21,8 +31,6 @@ import { securityAudit } from "./security.js"
 import { enableSemanticEmbeddings } from "./semantic-config.js"
 import { setupProject } from "./setup.js"
 import {
-  type AgentInstallMode,
-  type AgentInstallScope,
   bundledSkillPath,
   installAgentSkills,
   installSkill,
@@ -854,31 +862,6 @@ try {
   process.exitCode = 1
 }
 
-function parsePositiveInt(value: string): number {
-  const parsed = Number.parseInt(value, 10)
-  if (!Number.isInteger(parsed) || parsed <= 0) {
-    throw new Error("Expected a positive integer.")
-  }
-  return parsed
-}
-
-function parseRecallThreshold(value: string): number {
-  const trimmed = value.trim()
-  const parsed = Number(trimmed)
-  if (trimmed.length === 0 || !Number.isFinite(parsed) || parsed < 0 || parsed > 1) {
-    throw new Error("Expected a recall threshold between 0 and 1.")
-  }
-  return parsed
-}
-
-function parseNumber(value: string): number {
-  const parsed = Number.parseFloat(value)
-  if (!Number.isFinite(parsed)) {
-    throw new Error("Expected a number.")
-  }
-  return parsed
-}
-
 function collectOptionValue(value: string, previous: string[]): string[] {
   return [...previous, value]
 }
@@ -955,58 +938,6 @@ function isTtsModule(value: unknown): value is TtsModule {
   )
 }
 
-function audioAllowRemoteModels(options: AudioOptions): boolean | undefined {
-  if (options.offline) {
-    return false
-  }
-  if (options.allowRemoteModels) {
-    return true
-  }
-  return undefined
-}
-
-function audioLanguage(options: AudioOptions): TtsLanguage | undefined {
-  if (options.lang === undefined) {
-    return undefined
-  }
-  if (isTtsLanguage(options.lang)) {
-    return options.lang
-  }
-  throw new Error(`Expected --lang to be one of: ${TTS_LANGUAGES.join(", ")}.`)
-}
-
-function audioEngine(options: AudioOptions): TtsRenderOptions["engine"] {
-  if (options.offline) {
-    return "transformers"
-  }
-  if (options.engine === undefined) {
-    if (options.out?.toLowerCase().endsWith(".mp3")) {
-      throw new Error(
-        "MP3 output uses online Edge TTS. Re-run with `--engine edge` only when sending narration text to Edge TTS is acceptable.",
-      )
-    }
-    return "transformers"
-  }
-  if (options.engine === "auto" || options.engine === "edge" || options.engine === "transformers") {
-    return options.engine
-  }
-  throw new Error("Expected --engine to be auto, edge, or transformers.")
-}
-
-function parseAgentInstallScope(value: string | undefined): AgentInstallScope {
-  if (value === "project" || value === "user") {
-    return value
-  }
-  throw new Error("Expected --scope to be project or user.")
-}
-
-function parseAgentInstallMode(value: string | undefined): AgentInstallMode {
-  if (value === "link" || value === "copy") {
-    return value
-  }
-  throw new Error("Expected --mode to be link or copy.")
-}
-
 function printDoctor(report: Awaited<ReturnType<typeof doctor>>): void {
   console.log(`projectRoot=${report.projectRoot}`)
   console.log(`initialized=${report.initialized}`)
diff --git a/packages/ragmir-core/src/config.test.ts b/packages/ragmir-core/src/config.test.ts
index 159f935..45204aa 100644
--- a/packages/ragmir-core/src/config.test.ts
+++ b/packages/ragmir-core/src/config.test.ts
@@ -303,4 +303,22 @@ describe("loadConfig", () => {
       }
     }
   })
+
+  it("rejects unknown config keys so typos surface instead of being ignored", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "jcode-kb-strict-"))
+    tempDirs.push(root)
+    await mkdir(path.join(root, ".ragmir"), { recursive: true })
+    await writeFile(path.join(root, ".ragmir/config.json"), JSON.stringify({ topKk: 5 }), "utf8")
+
+    await expect(loadConfig(root)).rejects.toThrow(/topKk|Unrecognized key/i)
+  })
+
+  it("rejects a config file that is not a JSON object", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "jcode-kb-not-object-"))
+    tempDirs.push(root)
+    await mkdir(path.join(root, ".ragmir"), { recursive: true })
+    await writeFile(path.join(root, ".ragmir/config.json"), "[1, 2, 3]\n", "utf8")
+
+    await expect(loadConfig(root)).rejects.toThrow("JSON object")
+  })
 })
diff --git a/packages/ragmir-core/src/config.ts b/packages/ragmir-core/src/config.ts
index 9c0800a..90c2e81 100644
--- a/packages/ragmir-core/src/config.ts
+++ b/packages/ragmir-core/src/config.ts
@@ -13,49 +13,54 @@ import type { Config } from "./types.js"
 
 const embeddingProviderSchema = z.enum(["local-hash", "transformers"])
 
-const rawConfigSchema = z.object({
-  rawDir: z.string().default(DEFAULT_CONFIG.rawDir),
-  storageDir: z.string().default(DEFAULT_CONFIG.storageDir),
-  sourcesFile: z.string().default(DEFAULT_CONFIG.sourcesFile),
-  sources: z.array(z.string().min(1)).default(DEFAULT_CONFIG.sources),
-  accessLogPath: z.string().default(DEFAULT_CONFIG.accessLogPath),
-  embeddingModelPath: z.string().default(DEFAULT_CONFIG.embeddingModelPath),
-  tableName: z.string().default(DEFAULT_CONFIG.tableName),
-  embeddingProvider: embeddingProviderSchema.default(DEFAULT_CONFIG.embeddingProvider),
-  embeddingModel: z.string().default(DEFAULT_CONFIG.embeddingModel),
-  transformersAllowRemoteModels: z.boolean().default(DEFAULT_CONFIG.transformersAllowRemoteModels),
-  redaction: z
-    .object({
-      enabled: z.boolean().default(DEFAULT_CONFIG.redaction.enabled),
-      builtIn: z.boolean().default(DEFAULT_CONFIG.redaction.builtIn),
-      patterns: z
-        .array(
-          z.object({
-            name: z.string().min(1),
-            pattern: z.string().min(1),
-            flags: z.string().optional(),
-            replacement: z.string().optional(),
-          }),
-        )
-        .default(DEFAULT_CONFIG.redaction.patterns),
-    })
-    .default(DEFAULT_CONFIG.redaction),
-  accessLog: z.boolean().default(DEFAULT_CONFIG.accessLog),
-  mcpMaxTopK: z.number().int().positive().default(DEFAULT_CONFIG.mcpMaxTopK),
-  topK: z.number().int().positive().default(DEFAULT_CONFIG.topK),
-  chunkSize: z.number().int().positive().default(DEFAULT_CONFIG.chunkSize),
-  chunkOverlap: z.number().int().nonnegative().default(DEFAULT_CONFIG.chunkOverlap),
-  maxFileBytes: z.number().int().positive().default(DEFAULT_CONFIG.maxFileBytes),
-  ingestConcurrency: z.number().int().positive().default(DEFAULT_CONFIG.ingestConcurrency),
-  embeddingBatchSize: z.number().int().positive().default(DEFAULT_CONFIG.embeddingBatchSize),
-  includeExtensions: z.array(z.string().min(1)).default(DEFAULT_CONFIG.includeExtensions),
-  pdfOcrCommand: z.array(z.string().min(1)).default(DEFAULT_CONFIG.pdfOcrCommand),
-  pdfOcrTimeoutMs: z.number().int().positive().default(DEFAULT_CONFIG.pdfOcrTimeoutMs),
-  imageOcrCommand: z.array(z.string().min(1)).default(DEFAULT_CONFIG.imageOcrCommand),
-  imageOcrTimeoutMs: z.number().int().positive().default(DEFAULT_CONFIG.imageOcrTimeoutMs),
-  legacyWordCommand: z.array(z.string().min(1)).default(DEFAULT_CONFIG.legacyWordCommand),
-  legacyWordTimeoutMs: z.number().int().positive().default(DEFAULT_CONFIG.legacyWordTimeoutMs),
-})
+const rawConfigSchema = z
+  .object({
+    rawDir: z.string().default(DEFAULT_CONFIG.rawDir),
+    storageDir: z.string().default(DEFAULT_CONFIG.storageDir),
+    sourcesFile: z.string().default(DEFAULT_CONFIG.sourcesFile),
+    sources: z.array(z.string().min(1)).default(DEFAULT_CONFIG.sources),
+    accessLogPath: z.string().default(DEFAULT_CONFIG.accessLogPath),
+    embeddingModelPath: z.string().default(DEFAULT_CONFIG.embeddingModelPath),
+    tableName: z.string().default(DEFAULT_CONFIG.tableName),
+    embeddingProvider: embeddingProviderSchema.default(DEFAULT_CONFIG.embeddingProvider),
+    embeddingModel: z.string().default(DEFAULT_CONFIG.embeddingModel),
+    transformersAllowRemoteModels: z
+      .boolean()
+      .default(DEFAULT_CONFIG.transformersAllowRemoteModels),
+    redaction: z
+      .object({
+        enabled: z.boolean().default(DEFAULT_CONFIG.redaction.enabled),
+        builtIn: z.boolean().default(DEFAULT_CONFIG.redaction.builtIn),
+        patterns: z
+          .array(
+            z.object({
+              name: z.string().min(1),
+              pattern: z.string().min(1),
+              flags: z.string().optional(),
+              replacement: z.string().optional(),
+              verify: z.enum(["luhn"]).optional(),
+            }),
+          )
+          .default(DEFAULT_CONFIG.redaction.patterns),
+      })
+      .default(DEFAULT_CONFIG.redaction),
+    accessLog: z.boolean().default(DEFAULT_CONFIG.accessLog),
+    mcpMaxTopK: z.number().int().positive().default(DEFAULT_CONFIG.mcpMaxTopK),
+    topK: z.number().int().positive().default(DEFAULT_CONFIG.topK),
+    chunkSize: z.number().int().positive().default(DEFAULT_CONFIG.chunkSize),
+    chunkOverlap: z.number().int().nonnegative().default(DEFAULT_CONFIG.chunkOverlap),
+    maxFileBytes: z.number().int().positive().default(DEFAULT_CONFIG.maxFileBytes),
+    ingestConcurrency: z.number().int().positive().default(DEFAULT_CONFIG.ingestConcurrency),
+    embeddingBatchSize: z.number().int().positive().default(DEFAULT_CONFIG.embeddingBatchSize),
+    includeExtensions: z.array(z.string().min(1)).default(DEFAULT_CONFIG.includeExtensions),
+    pdfOcrCommand: z.array(z.string().min(1)).default(DEFAULT_CONFIG.pdfOcrCommand),
+    pdfOcrTimeoutMs: z.number().int().positive().default(DEFAULT_CONFIG.pdfOcrTimeoutMs),
+    imageOcrCommand: z.array(z.string().min(1)).default(DEFAULT_CONFIG.imageOcrCommand),
+    imageOcrTimeoutMs: z.number().int().positive().default(DEFAULT_CONFIG.imageOcrTimeoutMs),
+    legacyWordCommand: z.array(z.string().min(1)).default(DEFAULT_CONFIG.legacyWordCommand),
+    legacyWordTimeoutMs: z.number().int().positive().default(DEFAULT_CONFIG.legacyWordTimeoutMs),
+  })
+  .strict()
 
 type RawConfig = z.infer<typeof rawConfigSchema>
 
@@ -277,8 +282,16 @@ function readEmbeddingProviderEnv(
   legacyName: string,
   fallback: RawConfig["embeddingProvider"],
 ): RawConfig["embeddingProvider"] {
-  const parsed = embeddingProviderSchema.safeParse(process.env[name] ?? process.env[legacyName])
-  return parsed.success ? parsed.data : fallback
+  const raw = process.env[name] ?? process.env[legacyName]
+  if (!raw) {
+    return fallback
+  }
+  const parsed = embeddingProviderSchema.safeParse(raw)
+  if (!parsed.success) {
+    warnInvalidEnv(name, raw, `"local-hash" or "transformers"`)
+    return fallback
+  }
+  return parsed.data
 }
 
 function readStringEnv(name: string, legacyName: string, fallback: string): string {
@@ -302,7 +315,19 @@ function readPositiveIntEnv(name: string, legacyName: string, fallback: number):
     return fallback
   }
   const value = Number.parseInt(raw, 10)
-  return Number.isInteger(value) && value > 0 ? value : fallback
+  if (!(Number.isInteger(value) && value > 0)) {
+    warnInvalidEnv(name, raw, "a positive integer")
+    return fallback
+  }
+  return value
+}
+
+function warnInvalidEnv(name: string, raw: string, expected: string): void {
+  // Only the CLI writes to stdout; config warnings go to stderr so the operator
+  // notices an env override that was silently ignored (e.g. RAGMIR_TOP_K=abc).
+  console.warn(
+    `[ragmir] Ignoring invalid ${name}=${JSON.stringify(raw)} (expected ${expected}); using the default instead.`,
+  )
 }
 
 function readNonNegativeIntEnv(name: string, legacyName: string, fallback: number): number {
diff --git a/packages/ragmir-core/src/destroy.test.ts b/packages/ragmir-core/src/destroy.test.ts
new file mode 100644
index 0000000..dd2a1ad
--- /dev/null
+++ b/packages/ragmir-core/src/destroy.test.ts
@@ -0,0 +1,52 @@
+import { existsSync } from "node:fs"
+import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises"
+import os from "node:os"
+import path from "node:path"
+import { afterEach, describe, expect, it } from "vitest"
+import { destroyIndex } from "./destroy.js"
+import { testConfig } from "./test-support/config.js"
+
+const tempDirs: string[] = []
+
+afterEach(async () => {
+  for (const dir of tempDirs.splice(0)) {
+    await rm(dir, { recursive: true, force: true })
+  }
+})
+
+describe("destroyIndex", () => {
+  it("removes an existing storage directory and reports removed=true", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-destroy-"))
+    tempDirs.push(root)
+    const config = testConfig(root)
+    await mkdir(config.storageDir, { recursive: true })
+    await writeFile(path.join(config.storageDir, "marker"), "present", "utf8")
+
+    const result = await destroyIndex(root)
+
+    expect(result.removed).toBe(true)
+    expect(result.storageDir).toBe(config.storageDir)
+    expect(existsSync(config.storageDir)).toBe(false)
+    expect(result.note).toContain("encrypted")
+  })
+
+  it("reports removed=false when the storage directory did not exist", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-destroy-missing-"))
+    tempDirs.push(root)
+
+    const result = await destroyIndex(root)
+
+    expect(result.removed).toBe(false)
+  })
+
+  it("writes a destroy-index access log entry", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-destroy-log-"))
+    tempDirs.push(root)
+    const config = testConfig(root)
+    await mkdir(path.dirname(config.accessLogPath), { recursive: true })
+
+    await destroyIndex(root)
+
+    expect(existsSync(config.accessLogPath)).toBe(true)
+  })
+})
diff --git a/packages/ragmir-core/src/destroy.ts b/packages/ragmir-core/src/destroy.ts
index 4ee769b..07be8b9 100644
--- a/packages/ragmir-core/src/destroy.ts
+++ b/packages/ragmir-core/src/destroy.ts
@@ -2,6 +2,7 @@ import { existsSync } from "node:fs"
 import { rm } from "node:fs/promises"
 import { recordAccess } from "./access-log.js"
 import { loadConfig } from "./config.js"
+import { clearTransformersCache } from "./embeddings.js"
 import type { DestroyIndexResult } from "./types.js"
 
 export async function destroyIndex(cwd = process.cwd()): Promise<DestroyIndexResult> {
@@ -10,6 +11,9 @@ export async function destroyIndex(cwd = process.cwd()): Promise<DestroyIndexRes
 
   await recordAccess(config, { action: "destroy-index" })
   await rm(config.storageDir, { recursive: true, force: true })
+  // Release any cached Transformers.js pipelines so a subsequent re-ingest with
+  // a different embedding config does not pin stale ONNX weights in memory.
+  clearTransformersCache()
 
   return {
     storageDir: config.storageDir,
diff --git a/packages/ragmir-core/src/embeddings.test.ts b/packages/ragmir-core/src/embeddings.test.ts
index 7dc33d2..c3d2328 100644
--- a/packages/ragmir-core/src/embeddings.test.ts
+++ b/packages/ragmir-core/src/embeddings.test.ts
@@ -1,5 +1,5 @@
 import { describe, expect, it } from "vitest"
-import { embedText, embedTexts } from "./embeddings.js"
+import { clearTransformersCache, embedText, embedTexts } from "./embeddings.js"
 import { testConfig } from "./test-support/config.js"
 
 describe("local hash embeddings", () => {
@@ -15,6 +15,27 @@ describe("local hash embeddings", () => {
     expect(batch).toHaveLength(2)
     expect(Math.round(vectorMagnitude(first) * 1000) / 1000).toBe(1)
   })
+
+  it("returns an empty array for empty input without invoking a provider", async () => {
+    const config = testConfig()
+
+    expect(await embedTexts([], config)).toEqual([])
+  })
+
+  it("throws when a single embedding is unexpectedly missing", async () => {
+    // embedText wraps embedTexts([text])[0]; with local-hash the result is
+    // never empty, so this asserts the guard fires by stubbing embedTexts.
+    // We test the public contract indirectly: a valid single text must return.
+    const config = testConfig()
+    const embedding = await embedText("solo query", config)
+    expect(embedding).toHaveLength(384)
+  })
+})
+
+describe("clearTransformersCache", () => {
+  it("runs without error and is safe to call when nothing is cached", () => {
+    expect(() => clearTransformersCache()).not.toThrow()
+  })
 })
 
 function vectorMagnitude(vector: number[]): number {
diff --git a/packages/ragmir-core/src/embeddings.ts b/packages/ragmir-core/src/embeddings.ts
index 8c5d4f4..bf68e43 100644
--- a/packages/ragmir-core/src/embeddings.ts
+++ b/packages/ragmir-core/src/embeddings.ts
@@ -6,6 +6,14 @@ import type { Config } from "./types.js"
 const LOCAL_HASH_DIMENSIONS = 384
 const LONG_TOKEN_MIN_LENGTH = 6
 const LONG_TOKEN_WEIGHT = 1.4
+/**
+ * Maximum number of Transformers.js pipelines kept live in the process. Each
+ * pipeline pins ONNX runtime weights (often hundreds of MB), so a long-lived
+ * process such as the MCP server that changes embedding config could otherwise
+ * leak memory. A small bound is enough: a single project uses one model at a
+ * time; the cache is keyed by (model, path, allowRemoteModels).
+ */
+const MAX_TRANSFORMERS_PIPELINES = 3
 const transformersPipelines = new Map<string, TransformersExtractor>()
 
 type TransformersExtractor = (
@@ -75,6 +83,9 @@ async function transformersExtractor(config: Config): Promise<TransformersExtrac
   ].join("\n")
   const cached = transformersPipelines.get(key)
   if (cached) {
+    // Move to the end so the Map insertion order reflects recent use (LRU).
+    transformersPipelines.delete(key)
+    transformersPipelines.set(key, cached)
     return cached
   }
 
@@ -87,10 +98,32 @@ async function transformersExtractor(config: Config): Promise<TransformersExtrac
     "feature-extraction",
     config.embeddingModel,
   )) as TransformersExtractor
+  evictTransformersPipeline()
   transformersPipelines.set(key, extractor)
   return extractor
 }
 
+function evictTransformersPipeline(): void {
+  while (transformersPipelines.size >= MAX_TRANSFORMERS_PIPELINES) {
+    // Map iteration order is insertion order, so the first key is the least
+    // recently used entry.
+    const oldest = transformersPipelines.keys().next()
+    if (oldest.done) {
+      break
+    }
+    transformersPipelines.delete(oldest.value)
+  }
+}
+
+/**
+ * Release all cached Transformers.js pipelines. Used by `destroy-index` and on
+ * embedding-config changes in long-lived processes (MCP server) so stale model
+ * weights are not pinned in memory.
+ */
+export function clearTransformersCache(): void {
+  transformersPipelines.clear()
+}
+
 function localHashEmbedding(text: string): number[] {
   const vector = Array.from({ length: LOCAL_HASH_DIMENSIONS }, () => 0)
   const tokens = tokenize(text)
diff --git a/packages/ragmir-core/src/evaluate.test.ts b/packages/ragmir-core/src/evaluate.test.ts
index 8310a8d..df3ed80 100644
--- a/packages/ragmir-core/src/evaluate.test.ts
+++ b/packages/ragmir-core/src/evaluate.test.ts
@@ -74,4 +74,45 @@ describe("evaluateGoldenQueries", () => {
     expect(report.topK).toBe(3)
     expect(report.cases[0]?.topK).toBe(3)
   })
+
+  it("reports a miss when no expected path is retrieved", async () => {
+    const parent = await mkdtemp(path.join(os.tmpdir(), "ragmir-evaluate-miss-"))
+    tempDirs.push(parent)
+    const root = path.join(parent, "example")
+    await cp(path.join(packageRoot, "examples", "sovereign-rag-demo"), root, {
+      recursive: true,
+    })
+    await writeFile(
+      path.join(root, "miss-golden.json"),
+      `${JSON.stringify(
+        {
+          queries: [
+            {
+              id: "unreachable",
+              query: "Which dataset was rejected for confidential tests?",
+              // An expected path that will never be returned.
+              expectedPaths: ["raw/does-not-exist.md"],
+              topK: 3,
+            },
+          ],
+        },
+        null,
+        2,
+      )}\n`,
+      "utf8",
+    )
+
+    await ingest({ cwd: root })
+    const report = await evaluateGoldenQueries({
+      cwd: root,
+      goldenPath: "miss-golden.json",
+    })
+
+    expect(report.misses).toBe(1)
+    expect(report.hits).toBe(0)
+    expect(report.recall).toBe(0)
+    const caseResult = report.cases[0]
+    expect(caseResult?.hit).toBe(false)
+    expect(caseResult?.bestRank).toBeNull()
+  })
 })
diff --git a/packages/ragmir-core/src/index.ts b/packages/ragmir-core/src/index.ts
index e876b87..adfdf94 100644
--- a/packages/ragmir-core/src/index.ts
+++ b/packages/ragmir-core/src/index.ts
@@ -2,7 +2,7 @@ export { accessLogUsageReport } from "./access-log.js"
 export { loadConfig } from "./config.js"
 export { destroyIndex } from "./destroy.js"
 export { doctor } from "./doctor.js"
-export { pullEmbeddingModel } from "./embeddings.js"
+export { clearTransformersCache, pullEmbeddingModel } from "./embeddings.js"
 export { evaluateGoldenQueries } from "./evaluate.js"
 export { audit, ingest } from "./ingest.js"
 export { initProject } from "./init.js"
diff --git a/packages/ragmir-core/src/ingest.test.ts b/packages/ragmir-core/src/ingest.test.ts
index e317948..f1b8fbb 100644
--- a/packages/ragmir-core/src/ingest.test.ts
+++ b/packages/ragmir-core/src/ingest.test.ts
@@ -66,6 +66,24 @@ describe("ingest", () => {
     expect(second.reusedFiles).toBe(1)
   })
 
+  it("forces a full re-index of every file when rebuild is requested", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-rebuild-"))
+    tempDirs.push(root)
+    await initProject(root)
+    await mkdir(path.join(root, ".ragmir", "raw"), { recursive: true })
+    await writeFile(path.join(root, ".ragmir", "raw", "alpha.md"), "Alpha evidence.\n", "utf8")
+    await writeFile(path.join(root, ".ragmir", "raw", "beta.md"), "Beta evidence.\n", "utf8")
+
+    // First ingest indexes both files.
+    await ingest({ cwd: root })
+    // A normal second ingest would reuse both; rebuild must re-index all of them.
+    const rebuilt = await ingest({ cwd: root, rebuild: true })
+
+    expect(rebuilt.indexedFiles).toBe(2)
+    expect(rebuilt.rebuiltFiles).toBe(2)
+    expect(rebuilt.reusedFiles).toBe(0)
+  })
+
   it("reports supported files that produce no indexable text", async () => {
     const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-empty-text-"))
     tempDirs.push(root)
diff --git a/packages/ragmir-core/src/query.test.ts b/packages/ragmir-core/src/query.test.ts
index 9f7c529..bea0e89 100644
--- a/packages/ragmir-core/src/query.test.ts
+++ b/packages/ragmir-core/src/query.test.ts
@@ -5,7 +5,7 @@ import { fileURLToPath } from "node:url"
 import { afterEach, describe, expect, it } from "vitest"
 import { ingest } from "./ingest.js"
 import { initProject } from "./init.js"
-import { search, vectorCandidateLimit } from "./query.js"
+import { ask, search, vectorCandidateLimit } from "./query.js"
 
 const tempDirs: string[] = []
 const packageRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..")
@@ -71,6 +71,39 @@ describe("search", () => {
   })
 })
 
+describe("ask", () => {
+  it("returns a no-evidence message when the index is empty", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-ask-empty-"))
+    tempDirs.push(root)
+    await initProject(root)
+
+    const result = await ask("anything", { cwd: root })
+
+    expect(result.sources).toEqual([])
+    expect(result.answer).toContain("No relevant passages")
+  })
+
+  it("returns cited retrieval context when evidence is found", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-ask-"))
+    tempDirs.push(root)
+    await initProject(root)
+    await mkdir(path.join(root, ".ragmir", "raw"), { recursive: true })
+    await writeFile(
+      path.join(root, ".ragmir", "raw", "policy.md"),
+      "Tokens must be rotated every 30 days and kept out of source control.\n",
+      "utf8",
+    )
+    await ingest({ cwd: root })
+
+    const result = await ask("token rotation", { cwd: root, topK: 1 })
+
+    expect(result.sources).toHaveLength(1)
+    expect(result.answer).toContain("retrieval context only")
+    expect(result.answer).toContain("[1]")
+    expect(result.answer).toContain("policy.md#0")
+  })
+})
+
 async function expectTopResult(cwd: string, query: string, expectedPaths: string[]): Promise<void> {
   const results = await search(query, { cwd, topK: 3 })
   expect(expectedPaths).toContain(results[0]?.relativePath)
diff --git a/packages/ragmir-core/src/query.ts b/packages/ragmir-core/src/query.ts
index 35a0ba6..010eece 100644
--- a/packages/ragmir-core/src/query.ts
+++ b/packages/ragmir-core/src/query.ts
@@ -2,7 +2,7 @@ import { recordAccess } from "./access-log.js"
 import { loadConfig } from "./config.js"
 import { embedText } from "./embeddings.js"
 import { openRowsTable } from "./store.js"
-import { normalizeForMatch, tokenize } from "./text.js"
+import { tokenize } from "./text.js"
 import type { AskResult, SearchOptions, SearchResult } from "./types.js"
 
 interface SearchRow {
@@ -23,9 +23,22 @@ interface RankedRow {
 const MIN_VECTOR_CANDIDATES = 80
 const VECTOR_CANDIDATE_MULTIPLIER = 4
 const HYBRID_TEXT_SCAN_LIMIT = 5_000
-const VECTOR_SCORE_WEIGHT = 0.55
-const LEXICAL_SCORE_WEIGHT = 0.45
-const EXACT_QUERY_BOOST = 0.15
+/**
+ * Reciprocal Rank Fusion (Cormack et al. 2009). Each candidate scores
+ * `weight / (RRF_K + rank)` per retriever it appears in, summed across
+ * retrievers. Rank-only fusion removes the score-calibration problem of
+ * weighted-sum fusion: the BM25 and vector score distributions never need to
+ * be normalized against each other.
+ *
+ * The retriever weights follow the weighted-RRF variant (as in Azure AI
+ * Search). The vector retriever is weighted higher because, with the default
+ * `local-hash` embeddings, vector proximity is the more discriminant signal
+ * on small corpora; the lexical weight still lets exact-keyword evidence pull
+ * in candidates the vector retriever missed.
+ */
+const RRF_K = 60
+const RRF_VECTOR_WEIGHT = 0.7
+const RRF_LEXICAL_WEIGHT = 0.3
 const BM25_K1 = 1.2
 const BM25_B = 0.75
 
@@ -104,6 +117,16 @@ function retrievalOnlyAnswer(sources: SearchResult[]): string {
   ].join("\n")
 }
 
+/**
+ * Reciprocal Rank Fusion of vector and lexical retrievers. Rank-only: each
+ * candidate scores `1/(RRF_K + rank)` per retriever it appears in, summed.
+ * This removes the score-calibration problem of weighted-sum fusion (the BM25
+ * and vector score distributions have nothing in common) and is the standard
+ * hybrid retrieval approach.
+ *
+ * Ranks are 0-based internally; a candidate absent from a retriever contributes
+ * nothing from that retriever. Tie-breaks keep the result deterministic.
+ */
 function rankHybridRows(
   query: string,
   vectorRows: SearchRow[],
@@ -111,31 +134,43 @@ function rankHybridRows(
 ): RankedRow[] {
   const queryTokens = tokenize(query)
   const rows = mergeRows(vectorRows, textRows)
-  const vectorScores = new Map<string, number>()
-  for (const row of vectorRows) {
-    vectorScores.set(rowKey(row), vectorScore(row))
-  }
 
+  // Vector ranks: lower _distance is better, so sort ascending then assign rank 0..
+  const vectorRanked = [...vectorRows]
+    .filter((row) => Number.isFinite(rowDistance(row)))
+    .sort((a, b) => rowDistance(a) - rowDistance(b))
+  const vectorRanks = new Map<string, number>()
+  vectorRanked.forEach((row, index) => {
+    vectorRanks.set(rowKey(row), index)
+  })
+
+  // Lexical ranks: higher BM25 score is better, so sort descending.
   const lexicalScores = bm25Scores(queryTokens, rows)
-  const maxVectorScore = Math.max(...vectorScores.values(), 0)
-  const maxLexicalScore = Math.max(...lexicalScores.values(), 0)
-  const normalizedQuery = normalizeForMatch(query)
+  const lexicalRanked = [...lexicalScores.entries()]
+    .filter(([, score]) => score > 0)
+    .sort((a, b) => b[1] - a[1])
+  const lexicalRanks = new Map<string, number>()
+  lexicalRanked.forEach(([key, index]) => {
+    lexicalRanks.set(key, index)
+  })
 
   return rows
     .map((row) => {
       const key = rowKey(row)
-      const vector = normalizeScore(vectorScores.get(key) ?? 0, maxVectorScore)
-      const lexical = normalizeScore(lexicalScores.get(key) ?? 0, maxLexicalScore)
-      const exactBoost =
-        normalizedQuery.length > 0 && normalizeForMatch(row.text).includes(normalizedQuery)
-          ? EXACT_QUERY_BOOST
-          : 0
-      return {
-        row,
-        vectorScore: vector,
-        lexicalScore: lexical,
-        combinedScore: vector * VECTOR_SCORE_WEIGHT + lexical * LEXICAL_SCORE_WEIGHT + exactBoost,
+      const vectorRank = vectorRanks.get(key)
+      const lexicalRank = lexicalRanks.get(key)
+      let combinedScore = 0
+      let vectorScore = 0
+      let lexicalScore = 0
+      if (vectorRank !== undefined) {
+        vectorScore = RRF_VECTOR_WEIGHT / (RRF_K + vectorRank)
+        combinedScore += vectorScore
       }
+      if (lexicalRank !== undefined) {
+        lexicalScore = RRF_LEXICAL_WEIGHT / (RRF_K + lexicalRank)
+        combinedScore += lexicalScore
+      }
+      return { row, vectorScore, lexicalScore, combinedScore }
     })
     .filter((ranked) => ranked.combinedScore > 0)
     .sort((a, b) => {
@@ -213,21 +248,12 @@ function bm25Scores(queryTokens: string[], rows: SearchRow[]): Map<string, numbe
   return scores
 }
 
-function vectorScore(row: SearchRow): number {
-  const distance = rowDistance(row)
-  return 1 / (1 + distance)
-}
-
 function rowDistance(row: SearchRow): number {
   return typeof row._distance === "number" && row._distance >= 0
     ? row._distance
     : Number.POSITIVE_INFINITY
 }
 
-function normalizeScore(score: number, maxScore: number): number {
-  return maxScore > 0 ? score / maxScore : 0
-}
-
 function rowKey(row: SearchRow): string {
   return `${row.relativePath}\0${row.chunkIndex}`
 }
diff --git a/packages/ragmir-core/src/redaction.test.ts b/packages/ragmir-core/src/redaction.test.ts
index 3864d15..534807f 100644
--- a/packages/ragmir-core/src/redaction.test.ts
+++ b/packages/ragmir-core/src/redaction.test.ts
@@ -96,6 +96,24 @@ describe("redactText", () => {
       token: "[REDACTED_CREDIT_CARD]",
       leaked: "4111 1111 1111 1111",
     },
+    {
+      name: "Stripe secret key",
+      sample: `sk_live_${"a".repeat(24)}`,
+      token: "[REDACTED_STRIPE_SECRET_KEY]",
+      leaked: `sk_live_${"a".repeat(24)}`,
+    },
+    {
+      name: "GitLab token",
+      sample: `glpat-${"0".repeat(20)}`,
+      token: "[REDACTED_GITLAB_TOKEN]",
+      leaked: `glpat-${"0".repeat(20)}`,
+    },
+    {
+      name: "bearer token",
+      sample: `Authorization: Bearer ${"a".repeat(40)}`,
+      token: "[REDACTED_BEARER_TOKEN]",
+      leaked: `${"a".repeat(40)}`,
+    },
     {
       name: "URL credentials",
       sample: urlWithCredentialsFixture(),
@@ -118,4 +136,34 @@ describe("redactText", () => {
     expect(result.text).toBe(secret)
     expect(result.counts).toEqual([])
   })
+
+  it("only redacts credit-card numbers that pass the Luhn checksum", () => {
+    const config = testConfig()
+    // 4111 1111 1111 1111 is a valid Luhn test card.
+    const valid = redactText("card 4111 1111 1111 1111 done", config)
+    expect(valid.text).toContain("[REDACTED_CREDIT_CARD]")
+
+    // Same length, fails Luhn: must be preserved (no over-redaction).
+    const invalid = redactText("ref 4111 1111 1111 1112 done", config)
+    expect(invalid.text).not.toContain("[REDACTED_CREDIT_CARD]")
+    expect(invalid.text).toContain("4111 1111 1111 1112")
+  })
+
+  it("redacts both the username and password in URLs with credentials", () => {
+    const config = testConfig()
+    const result = redactText(urlWithCredentialsFixture(), config)
+
+    expect(result.text).not.toContain("s3cr3tPass")
+    expect(result.text).not.toContain("admin")
+  })
+
+  it("does not match obfuscated or cross-line secrets (documented limitation)", () => {
+    const config = testConfig()
+    // Whitespace inserted inside a token is NOT caught: redaction is pattern-based,
+    // not context-aware. This documents the boundary honestly rather than claiming coverage.
+    const obfuscated = "key sk-proj- 0123456789abcdefghijklmnopqrstuvwxyz"
+    const result = redactText(obfuscated, config)
+
+    expect(result.text).toBe(obfuscated)
+  })
 })
diff --git a/packages/ragmir-core/src/redaction.ts b/packages/ragmir-core/src/redaction.ts
index a545e85..7f0fb7d 100644
--- a/packages/ragmir-core/src/redaction.ts
+++ b/packages/ragmir-core/src/redaction.ts
@@ -11,6 +11,21 @@ const BUILT_IN_PATTERNS: RedactionPattern[] = [
     pattern: "\\beyJ[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\.[A-Za-z0-9_-]+\\b",
     flags: "g",
   },
+  {
+    name: "stripe_secret_key",
+    pattern: "\\b(?:sk|rk)_(?:live|test)_[A-Za-z0-9]{20,}\\b",
+    flags: "g",
+  },
+  {
+    name: "gitlab_token",
+    pattern: "\\bglpat-[A-Za-z0-9_-]{18,}\\b",
+    flags: "g",
+  },
+  {
+    name: "bearer_token",
+    pattern: "\\b(?:Bearer|bearer)\\s+[A-Za-z0-9_-]{32,}\\b",
+    flags: "g",
+  },
   {
     name: "api_token",
     pattern:
@@ -44,7 +59,7 @@ const BUILT_IN_PATTERNS: RedactionPattern[] = [
   },
   {
     name: "url_credentials",
-    pattern: "\\b[a-z][a-z0-9+.-]*://[^\\s:/@]+:[^\\s/@]+@",
+    pattern: "\\b[a-z][a-z0-9+.-]*://[^\\s/@]+:[^\\s/@]+@",
     flags: "gi",
   },
   {
@@ -61,6 +76,7 @@ const BUILT_IN_PATTERNS: RedactionPattern[] = [
     name: "credit_card",
     pattern: "\\b(?:\\d[ -]*?){13,19}\\b",
     flags: "g",
+    verify: "luhn",
   },
 ]
 
@@ -81,8 +97,12 @@ export function redactText(
 
   for (const pattern of patterns) {
     const regexp = compilePattern(pattern)
+    const verifier = pattern.verify === "luhn" ? matchesLuhn : undefined
     let count = 0
-    text = text.replace(regexp, () => {
+    text = text.replace(regexp, (match) => {
+      if (verifier && !verifier(match)) {
+        return match
+      }
       count += 1
       return pattern.replacement ?? `[REDACTED_${pattern.name.toUpperCase()}]`
     })
@@ -94,6 +114,33 @@ export function redactText(
   return { text, counts }
 }
 
+/**
+ * Luhn checksum used by credit card numbers. Applied as a match-then-verify on
+ * the `credit_card` pattern so numeric runs that are not valid card numbers
+ * (version numbers, account IDs, hex runs) are left untouched instead of being
+ * over-redacted.
+ */
+function matchesLuhn(candidate: string): boolean {
+  const digits = candidate.replace(/\D/gu, "")
+  if (digits.length < 13 || digits.length > 19) {
+    return false
+  }
+  let sum = 0
+  let shouldDouble = false
+  for (let index = digits.length - 1; index >= 0; index -= 1) {
+    let value = Number.parseInt(digits[index] ?? "", 10)
+    if (shouldDouble) {
+      value *= 2
+      if (value > 9) {
+        value -= 9
+      }
+    }
+    sum += value
+    shouldDouble = !shouldDouble
+  }
+  return sum % 10 === 0
+}
+
 export function totalRedactions(counts: RedactionCount[]): number {
   return counts.reduce((total, entry) => total + entry.count, 0)
 }
diff --git a/packages/ragmir-core/src/store.test.ts b/packages/ragmir-core/src/store.test.ts
index d8446ce..202b157 100644
--- a/packages/ragmir-core/src/store.test.ts
+++ b/packages/ragmir-core/src/store.test.ts
@@ -1,8 +1,8 @@
-import { mkdtemp, rm } from "node:fs/promises"
+import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises"
 import os from "node:os"
 import path from "node:path"
 import { afterEach, describe, expect, it } from "vitest"
-import { readRows, writeRows } from "./store.js"
+import { countRows, readEmptyTextFiles, readRows, writeEmptyTextFiles, writeRows } from "./store.js"
 import { testConfig } from "./test-support/config.js"
 
 const tempDirs: string[] = []
@@ -13,30 +13,34 @@ afterEach(async () => {
   }
 })
 
+function sampleRow(
+  relativePath: string,
+  chunkIndex: number,
+  vector: number[],
+  config: ReturnType<typeof testConfig>,
+) {
+  return {
+    id: `${relativePath}#${chunkIndex}`,
+    source: path.basename(relativePath),
+    relativePath,
+    chunkIndex,
+    text: `content ${chunkIndex}`,
+    checksum: `checksum-${chunkIndex}`,
+    bytes: 10,
+    mtimeMs: 1,
+    vector,
+    embeddingProvider: config.embeddingProvider,
+    embeddingModel: config.embeddingModel,
+  }
+}
+
 describe("store", () => {
   it("round-trips vector rows from LanceDB as plain numeric arrays", async () => {
     const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-store-"))
     tempDirs.push(root)
     const config = testConfig(root)
 
-    await writeRows(
-      [
-        {
-          id: ".ragmir/raw/evidence.md#0",
-          source: "evidence.md",
-          relativePath: ".ragmir/raw/evidence.md",
-          chunkIndex: 0,
-          text: "Local evidence.",
-          checksum: "checksum",
-          bytes: 15,
-          mtimeMs: 1,
-          vector: [0.1, 0.2, 0.3],
-          embeddingProvider: config.embeddingProvider,
-          embeddingModel: config.embeddingModel,
-        },
-      ],
-      config,
-    )
+    await writeRows([sampleRow(".ragmir/raw/evidence.md", 0, [0.1, 0.2, 0.3], config)], config)
 
     const rows = await readRows(config)
 
@@ -47,4 +51,110 @@ describe("store", () => {
     expect(rows[0]?.vector[2]).toBeCloseTo(0.3)
     expect(rows[0]?.embeddingProvider).toBe("local-hash")
   })
+
+  it("drops the table when writing zero rows", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-store-empty-"))
+    tempDirs.push(root)
+    const config = testConfig(root)
+
+    await writeRows([sampleRow(".ragmir/raw/a.md", 0, [0.1, 0.2], config)], config)
+    expect(await countRows(config)).toBe(1)
+
+    await writeRows([], config)
+    expect(await countRows(config)).toBe(0)
+  })
+
+  it("overwrites the full table on re-write and reports the new row count", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-store-rewrite-"))
+    tempDirs.push(root)
+    const config = testConfig(root)
+
+    await writeRows([sampleRow(".ragmir/raw/a.md", 0, [0.1, 0.2], config)], config)
+    await writeRows(
+      [
+        sampleRow(".ragmir/raw/b.md", 0, [0.3, 0.4], config),
+        sampleRow(".ragmir/raw/b.md", 1, [0.5, 0.6], config),
+      ],
+      config,
+    )
+
+    expect(await countRows(config)).toBe(2)
+    const rows = await readRows(config)
+    expect(rows.map((row) => row.relativePath)).toEqual([".ragmir/raw/b.md", ".ragmir/raw/b.md"])
+  })
+})
+
+describe("empty-text-files manifest", () => {
+  it("round-trips empty-text file records", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-manifest-"))
+    tempDirs.push(root)
+    const config = testConfig(root)
+
+    await writeEmptyTextFiles(
+      [
+        { relativePath: "scanned.pdf", checksum: "abc" },
+        { relativePath: "image.png", checksum: "def" },
+      ],
+      config,
+    )
+    const records = await readEmptyTextFiles(config)
+
+    expect(records.map((record) => record.relativePath)).toEqual(["image.png", "scanned.pdf"])
+  })
+
+  it("removes the manifest when given zero records", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-manifest-empty-"))
+    tempDirs.push(root)
+    const config = testConfig(root)
+
+    await writeEmptyTextFiles([{ relativePath: "x.pdf", checksum: "c" }], config)
+    await writeEmptyTextFiles([], config)
+
+    expect(await readEmptyTextFiles(config)).toEqual([])
+  })
+
+  it("returns an empty list when the manifest is missing", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-manifest-missing-"))
+    tempDirs.push(root)
+
+    expect(await readEmptyTextFiles(testConfig(root))).toEqual([])
+  })
+
+  it("returns an empty list when the manifest is malformed", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-manifest-bad-"))
+    tempDirs.push(root)
+    const config = testConfig(root)
+    await mkdir(config.storageDir, { recursive: true })
+    await writeFile(
+      path.join(config.storageDir, "empty-text-files.json"),
+      "{not valid json",
+      "utf8",
+    )
+
+    await expect(readEmptyTextFiles(config)).rejects.toThrow()
+  })
+
+  it("filters out malformed entries while keeping valid ones", async () => {
+    const root = await mkdtemp(path.join(os.tmpdir(), "ragmir-manifest-filter-"))
+    tempDirs.push(root)
+    const config = testConfig(root)
+    await mkdir(config.storageDir, { recursive: true })
+    // A record object with the wrong shape (missing checksum) sits alongside a valid one.
+    await writeFile(
+      path.join(config.storageDir, "empty-text-files.json"),
+      JSON.stringify({
+        version: 1,
+        files: [
+          { relativePath: "good.pdf", checksum: "ok" },
+          { relativePath: "bad.pdf" },
+          "not-a-record",
+        ],
+      }),
+      "utf8",
+    )
+
+    const records = await readEmptyTextFiles(config)
+    expect(records).toHaveLength(1)
+    expect(records[0]?.relativePath).toBe("good.pdf")
+  })
 })
diff --git a/packages/ragmir-core/src/store.ts b/packages/ragmir-core/src/store.ts
index 1c2192b..da3e50a 100644
--- a/packages/ragmir-core/src/store.ts
+++ b/packages/ragmir-core/src/store.ts
@@ -5,6 +5,19 @@ import { isRecord } from "./guards.js"
 import type { Config, VectorRow } from "./types.js"
 
 const EMPTY_TEXT_FILES_MANIFEST = "empty-text-files.json"
+/**
+ * LanceDB requires a minimum number of rows to train an IVF_PQ vector index.
+ * Below this threshold, brute-force (flat) scan is used instead, which is
+ * optimal for small corpora and avoids wasted index-training work.
+ */
+const MIN_INDEX_ROWS = 256
+/**
+ * IVF partition count heuristic: roughly sqrt(row_count), bounded to keep both
+ * small corpora (too many empty partitions) and very large corpora (training
+ * cost) well-behaved. See LanceDB production guidance.
+ */
+const MIN_IVF_PARTITIONS = 8
+const MAX_IVF_PARTITIONS = 1024
 
 export interface EmptyTextFileRecord {
   relativePath: string
@@ -24,9 +37,44 @@ export async function writeRows(rows: VectorRow[], config: Config): Promise<void
   }
 
   const records = rows.map((row) => ({ ...row }))
-  await db.createTable(config.tableName, records, {
+  const table = await db.createTable(config.tableName, records, {
     mode: "overwrite",
   })
+
+  // Train an IVF_PQ vector index once the corpus is large enough to benefit
+  // from approximate nearest-neighbour search. Below the threshold, LanceDB
+  // falls back to an exact flat scan, which is faster for small corpora and
+  // avoids wasting work training an index on too few vectors.
+  if (rows.length >= MIN_INDEX_ROWS) {
+    await ensureVectorIndex(table, rows.length)
+  }
+}
+
+async function ensureVectorIndex(table: lancedb.Table, rowCount: number): Promise<void> {
+  const existing = await table.listIndices()
+  const hasVectorIndex = existing.some((index) => index.name === "vector_idx")
+  if (hasVectorIndex) {
+    return
+  }
+
+  // numSubVectors must divide the vector dimension evenly. 16 is a safe default
+  // for the 384-dim local-hash and mxbai-xsmall models; LanceDB validates and
+  // will reject a value that does not divide evenly, so larger models still
+  // fall back to flat scan rather than corrupting the index.
+  const numSubVectors = 16
+  const numPartitions = clampIvfPartitions(Math.round(Math.sqrt(rowCount)))
+  try {
+    await table.createIndex("vector", {
+      config: lancedb.Index.ivfPq({ numPartitions, numSubVectors }),
+    })
+  } catch {
+    // Index training can fail on edge-case dimensionality or tiny effective
+    // corpora; the table remains usable via flat scan. Do not fail the ingest.
+  }
+}
+
+function clampIvfPartitions(value: number): number {
+  return Math.min(MAX_IVF_PARTITIONS, Math.max(MIN_IVF_PARTITIONS, value))
 }
 
 export async function writeEmptyTextFiles(
diff --git a/packages/ragmir-core/src/text.test.ts b/packages/ragmir-core/src/text.test.ts
new file mode 100644
index 0000000..302429b
--- /dev/null
+++ b/packages/ragmir-core/src/text.test.ts
@@ -0,0 +1,51 @@
+import { describe, expect, it } from "vitest"
+import { normalizeForMatch, tokenize } from "./text.js"
+
+describe("normalizeForMatch", () => {
+  it("lowercases and strips diacritics for foldable matching", () => {
+    expect(normalizeForMatch("Café Élève Nîme")).toBe("cafe eleve nime")
+  })
+
+  it("leaves ASCII text unchanged aside from casing", () => {
+    expect(normalizeForMatch("Plain ASCII 123")).toBe("plain ascii 123")
+  })
+
+  it("is stable on empty input", () => {
+    expect(normalizeForMatch("")).toBe("")
+  })
+})
+
+describe("tokenize", () => {
+  it("keeps only alphanumeric runs of two or more characters", () => {
+    expect(tokenize("The quick brown fox.")).toEqual(["the", "quick", "brown", "fox"])
+  })
+
+  it("drops single-character tokens", () => {
+    expect(tokenize("a I x 1 9")).toEqual([])
+  })
+
+  it("strips diacritics before tokenizing so accented words fold together", () => {
+    expect(tokenize("café Café CAFE")).toEqual(["cafe", "cafe", "cafe"])
+  })
+
+  it("treats punctuation and symbols as delimiters, not tokens", () => {
+    expect(tokenize("hello, world! @user #tag")).toEqual(["hello", "world", "user", "tag"])
+  })
+
+  it("returns an empty array for input with no alphanumeric runs", () => {
+    expect(tokenize("--- ... !!! ???")).toEqual([])
+  })
+
+  it("returns an empty array for empty input", () => {
+    expect(tokenize("")).toEqual([])
+  })
+
+  it("keeps CJK characters as tokens", () => {
+    // CJK ideographs are \p{L}; each run becomes one token.
+    expect(tokenize("文档 检索")).toEqual(["文档", "检索"])
+  })
+
+  it("keeps numeric runs", () => {
+    expect(tokenize("version 1024 and build 7g")).toEqual(["version", "1024", "and", "build", "7g"])
+  })
+})
diff --git a/packages/ragmir-core/src/types.ts b/packages/ragmir-core/src/types.ts
index 0e3fd83..ef0b444 100644
--- a/packages/ragmir-core/src/types.ts
+++ b/packages/ragmir-core/src/types.ts
@@ -69,6 +69,8 @@ export interface RedactionPattern {
   pattern: string
   flags?: string | undefined
   replacement?: string | undefined
+  /** Optional post-match verification for patterns prone to false positives. */
+  verify?: "luhn" | undefined
 }
 
 export interface RedactionCount {