From 5166cd7dd9bf4c1316bce81da337c7cc59f7e532 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Thu, 14 May 2026 11:27:07 -0600
Subject: [PATCH] =?UTF-8?q?feat(0.25.0):=20ProductionLoop=20primitive=20?=
 =?UTF-8?q?=E2=80=94=20close=20the=20eval=E2=86=92prod=E2=86=92eval=20cycl?=
 =?UTF-8?q?e?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pieces to close the loop were already in the package
(runMultiShotOptimization, failureClusterView, evaluateReleaseConfidence,
extractPreferences, FeedbackTrajectoryStore, TraceStore). This release
adds the one clean primitive that wires them end-to-end.

- runProductionLoop({...}): one call = one cycle. Ingest prod traces +
  feedback, cluster failures, evolve against the worst cluster, gate
  fail-closed, open a PR with the new prompt. Idempotent + replayable;
  cron is the consumer's job.
- proposeAutomatedPullRequest + ghCliClient / httpGithubClient (no new
  deps; both transports tested with fakes).
- POST /v1/feedback + POST /v1/traces/ingest (NDJSON-capable) wire
  endpoints. Optional bearer auth (healthz / version stay exempt).
- examples/production-loop: runnable synthetic end-to-end demo.
- 33 new tests (1052 total, up from 1019). Typecheck + biome + build
  clean. openapi.json regenerated.
---
 CHANGELOG.md                          |  65 +++
 README.md                             |  71 +++
 clients/python/pyproject.toml         |   2 +-
 examples/production-loop/README.md    |  79 +++
 examples/production-loop/index.ts     | 292 +++++++++++
 package.json                          |   2 +-
 src/auto-pr.ts                        | 509 +++++++++++++++++++
 src/index.ts                          |  36 ++
 src/production-loop.ts                | 679 ++++++++++++++++++++++++++
 src/wire/handlers.ts                  |  85 +++-
 src/wire/openapi.ts                   |  75 +++
 src/wire/schemas.ts                   | 165 +++++++
 src/wire/server.ts                    | 129 ++++-
 tests/auto-pr.test.ts                 | 290 +++++++++++
 tests/production-loop-example.test.ts | 186 +++++++
 tests/production-loop.test.ts         | 460 +++++++++++++++++
 tests/wire-ingestion.test.ts          | 295 +++++++++++
 17 files changed, 3409 insertions(+), 11 deletions(-)
 create mode 100644 examples/production-loop/README.md
 create mode 100644 examples/production-loop/index.ts
 create mode 100644 src/auto-pr.ts
 create mode 100644 src/production-loop.ts
 create mode 100644 tests/auto-pr.test.ts
 create mode 100644 tests/production-loop-example.test.ts
 create mode 100644 tests/production-loop.test.ts
 create mode 100644 tests/wire-ingestion.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 452f9e8..c56c397 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,70 @@
 # Changelog
 
+## 0.25.0 — ProductionLoop primitive: close the eval → prod → eval cycle
+
+This release ships the **orchestration layer** that turns the existing
+eval substrate into a continuously-improving production system. Static
+prompts decay; today's regulation flips tomorrow. The pieces to close
+the loop were already in the package (`runMultiShotOptimization`,
+`failureClusterView`, `evaluateReleaseConfidence`, `extractPreferences`,
+`FeedbackTrajectoryStore`, `TraceStore`); this release adds the one
+clean primitive that wires them together end-to-end.
+
+### Added
+
+- **`runProductionLoop({ ... })`** (`src/production-loop.ts`,
+  `@experimental`) — one call = one cycle. Ingests production traces
+  and feedback, clusters failures, runs evolve against the worst
+  cluster, gates with `HeldOutGate` + `evaluateReleaseConfidence`
+  (fail-closed), and — when wired with an `AutoPrClient` — opens a PR
+  with the improved prompt. Idempotent + replayable: same `runId`
+  yields the same plan. Cron / GitHub Actions are the consumer's job;
+  the primitive doesn't own scheduling.
+
+- **`proposeAutomatedPullRequest(client, input)`** + two transports
+  (`src/auto-pr.ts`, `@experimental`):
+    - `httpGithubClient({ token, ... })` — direct REST against
+      `api.github.com`, no extra deps. Idempotent on branch name:
+      existing open PRs are returned, not duplicated.
+    - `ghCliClient({ ... })` — shells out to `gh` for environments
+      where developer auth state is already configured.
+  Both validate inputs (no `..` paths, no whitespace branches, no
+  duplicate file changes) and surface `ValidationError` / `ConfigError`
+  from the typed taxonomy.
+
+- **`POST /v1/feedback` + `POST /v1/traces/ingest`** wire endpoints
+  (`src/wire/`). Both Zod-validated, both append to the configured
+  store (`FeedbackTrajectoryStore` / `TraceStore`). 503 when no store
+  is wired (fail loud, not silent). Traces ingest accepts both
+  `application/json` (`{events:[...]}`) and `application/x-ndjson` for
+  streaming production runtimes. Schemas (`TraceEvent`,
+  `FeedbackTrajectory`, `TracesIngestRequest/Response`,
+  `FeedbackIngestResponse`) added to `openapi.json` for cross-language
+  clients.
+
+- **Optional bearer-token auth** on the wire server, configured via
+  `createApp({ auth: { bearer: '...' } })` or as a verifier function
+  for rotating tokens. `/healthz` and `/v1/version` remain unprotected
+  (regression: never lock monitoring out of the runtime).
+
+- **`examples/production-loop/`** — synthetic end-to-end demo wiring
+  the loop against in-memory trace + feedback stores and a fake
+  auto-PR client. Shows the failure-cluster trigger, the evolve round,
+  the gate verdict, and the PR-shaped output without requiring
+  credentials or a live model.
+
+### Changed
+
+- **Wire server** (`createApp(opts)`) now accepts optional
+  `IngestionStores` (`{ traceStore?, feedbackStore? }`) and `auth`.
+  Existing zero-arg callers continue to work — judge / rubrics /
+  version / healthz are unchanged.
+
+### Status tags
+
+- Every new export is `@experimental` initially. Pin the patch version
+  if you depend on it. All other 0.24.0 stability tags are preserved.
+
 ## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
 
 This release is **DX + correctness**. No production behavior moved; consumer
diff --git a/README.md b/README.md
index 0a11bf8..80ae63d 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,75 @@ await product.storeEvalResult(task.id, result)
 Same loop shape in production, replay, benchmark, and optimization. Swap the
 dependencies behind `observe()` and `act()`, never the eval contract.
 
+## Production loop — close the eval → prod → eval cycle (0.25.0)
+
+Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk
+becomes today's incident. The production agents that win are the ones that
+**continuously re-train against live failure modes**.
+
+`runProductionLoop` is the orchestration layer that wires the existing eval
+substrate into a self-improvement cron:
+
+```ts
+import {
+  runProductionLoop,
+  httpGithubClient,
+  FileSystemFeedbackTrajectoryStore,
+} from '@tangle-network/agent-eval'
+import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces'
+
+const result = await runProductionLoop({
+  runId: `weekly-${new Date().toISOString().slice(0, 10)}`,
+  target: 'tax-agent',
+
+  // 1. Where production traces + feedback land. Wire the HTTP ingestion
+  //    endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your
+  //    runtime; the same store reads them here.
+  traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }),
+  feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }),
+
+  // 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus.
+  cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 },
+
+  // 3. Evolve: seed = current prompt, gate against holdout scenarios.
+  evolve: {
+    baselinePrompt: currentSystemPrompt,
+    holdoutScenarios: productionShapeScenarios,
+    runner,                            // your agent driver
+    scorer,                            // calibrated judge or rubric
+    mutator,                           // GEPA-style or addendum-style mutator
+    gate: {
+      baselineKey: 'baseline',
+      minProductiveRuns: 5,
+      pairedDeltaThreshold: 0.03,      // require Nσ improvement on holdout
+      overfitGapThreshold: 0.10,
+    },
+  },
+
+  // 4. Ship: when the gate passes, open a PR with the new prompt.
+  ship: {
+    client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }),
+    repo: { owner: 'tangle-network', name: 'tax-agent' },
+    branchPrefix: 'eval/auto-improve',
+    promptFilePath: 'prompts/tax-agent-system.txt',
+    reviewers: ['drew'],
+  },
+
+  cron: { cadence: 'weekly' },         // surface-only; consumer schedules
+})
+
+console.log(result.decision)            // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ...
+console.log(result.pullRequest?.prUrl)  // populated when a PR was opened
+```
+
+The primitive runs **one cycle**. Schedule it with `workflow_dispatch` + cron in
+GitHub Actions. It is **idempotent + replayable**: same `runId` → same plan.
+Gate failures are fail-closed — a candidate that beats baseline on search but
+overfits on holdout never lands.
+
+Full runnable demo (synthetic traces, no credentials) in
+[`examples/production-loop`](./examples/production-loop/README.md).
+
 ## Self-improvement loop
 
 Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
@@ -222,6 +291,8 @@ and runtime. See [`examples/`](./examples/).
   closed loop — score, reflect, mutate, re-score, repeat.
 - [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
   RunRecord → preferences → trainer (prime-rl) → next campaign.
+- [`examples/production-loop`](./examples/production-loop/README.md):
+  ingest prod traces + feedback, cluster failures, evolve, gate, open a PR.
 
 ## Docs
 
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index 8308d99..e825c86 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.24.0"
+version = "0.25.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/examples/production-loop/README.md b/examples/production-loop/README.md
new file mode 100644
index 0000000..5741c3d
--- /dev/null
+++ b/examples/production-loop/README.md
@@ -0,0 +1,79 @@
+# Production loop
+
+End-to-end demo of `runProductionLoop` — the orchestration layer that
+closes eval → prod → eval.
+
+## What it shows
+
+- 8 synthetic production failures (all hitting the same `instruction_following`
+  failure class — missing statute citations on FTC rule questions) seeded
+  into an `InMemoryTraceStore`.
+- 8 matching 👎 user-feedback labels seeded into an
+  `InMemoryFeedbackTrajectoryStore`.
+- One `runProductionLoop` cycle:
+  - `failureClusterView` surfaces the cluster, which crosses the
+    `minClusterSize: 5` threshold.
+  - `runMultiShotOptimization` runs 2 generations × 2 reps over 3
+    holdout scenarios, with an addendum-style mutator that appends a
+    citation directive to the baseline prompt.
+  - `HeldOutGate` checks that the paired-Δ on the holdout split is
+    positive with `minProductiveRuns: 3`.
+  - `evaluateReleaseConfidence` cross-checks pass-rate, mean score,
+    overfit gap, and the gate decision (fail-closed on any axis).
+  - On pass, a fake `AutoPrClient` captures the PR plan — a real
+    deployment would wire `httpGithubClient({ token })` or
+    `ghCliClient()`.
+
+## Run
+
+```sh
+pnpm tsx examples/production-loop/index.ts
+```
+
+## Expected output
+
+```
+═══════════════════════════════════════════════════════════════
+production-loop demo · synthetic prod data → improved prompt
+═══════════════════════════════════════════════════════════════
+runId          : prod-loop-demo-<epoch>
+target         : tax-agent
+decision       : pr_opened
+observed runs  : 8
+observed feedback: 8
+clusters seen  : 1
+acted-on       : class=instruction_following runs=8 scenarios=1
+gate           : promote=true medianΔ=0.450 CI=[0.450, 0.450]
+release status : pass (passRate=...)
+───────────────────────────────────────────────────────────────
+PR opened      : https://github.com/tangle-network/tax-agent/pull/synthetic-1
+branch         : eval/auto-improve/prod-loop-demo-<epoch>
+head SHA       : face-cafe-beef-...
+───────────────────────────────────────────────────────────────
+PR title: tax-agent: production-loop prompt update (prod-loop-demo-<epoch>)
+PR file: prompts/tax-agent-system.txt
+PR body preview:
+  ## Production-loop prompt update — `tax-agent`
+
+  Run id: `prod-loop-demo-<epoch>`
+  Decision: `pr_opened`
+  Observed in this cycle: 8 prod runs, 8 feedback trajectories.
+
+  ### Triggering failure cluster
+  ...
+═══════════════════════════════════════════════════════════════
+```
+
+## Adapt this to your product
+
+| Synthetic                       | Production                                          |
+| ------------------------------- | --------------------------------------------------- |
+| `InMemoryTraceStore`            | `FileSystemTraceStore`, or HTTP-ingest via `POST /v1/traces/ingest` |
+| `InMemoryFeedbackTrajectoryStore` | `FileSystemFeedbackTrajectoryStore`, or HTTP-ingest via `POST /v1/feedback` |
+| deterministic `runner`          | your agent driver invoking real tools               |
+| deterministic `scorer`          | calibrated judge (`callLlmJson` + `Rubric`)         |
+| `captureAutoPrClient()`         | `httpGithubClient({ token })` or `ghCliClient()`    |
+| `main()`                        | scheduled GitHub Action (`workflow_dispatch` + cron) |
+
+The primitive is **idempotent** + **replayable**: re-running with the
+same `runId` produces the same plan. Safe to retry on transient errors.
diff --git a/examples/production-loop/index.ts b/examples/production-loop/index.ts
new file mode 100644
index 0000000..081074a
--- /dev/null
+++ b/examples/production-loop/index.ts
@@ -0,0 +1,292 @@
+/**
+ * Production loop — runnable end-to-end demo.
+ *
+ * What this shows:
+ *   - Feed 8 synthetic production failures into a TraceStore + matching
+ *     user-feedback labels into a FeedbackTrajectoryStore.
+ *   - Configure a `runProductionLoop` cycle:
+ *       cluster threshold = 5 runs / 5% severity,
+ *       evolve = 2 generations × 2 reps over 3 holdout scenarios,
+ *       gate = paired-Δ > 0 over 3 productive runs,
+ *       ship = a fake `AutoPrClient` that captures the PR plan.
+ *   - Print the loop's decision, the gate evidence, and the
+ *     PR-shaped artifact.
+ *
+ * No network. No credentials. Run with:
+ *
+ *   pnpm tsx examples/production-loop/index.ts
+ *
+ * In a real wiring you would:
+ *   - Replace the in-memory stores with a `FileSystemTraceStore` and
+ *     `FileSystemFeedbackTrajectoryStore` shared with the production
+ *     runtime (or wire the HTTP ingestion endpoints).
+ *   - Replace the deterministic runner with your actual agent driver.
+ *   - Replace the deterministic scorer with a calibrated judge
+ *     (`callLlmJson` + a Rubric, or any `MultiShotScorer`).
+ *   - Wire a real `AutoPrClient`: `httpGithubClient({ token: GITHUB_TOKEN })`
+ *     in CI, or `ghCliClient()` for developer machines.
+ *   - Trigger the loop via a `workflow_dispatch` or scheduled GitHub
+ *     Action — the primitive runs ONE cycle; cron is the consumer's job.
+ */
+
+import type {
+  AutoPrClient,
+  ProposeAutomatedPullRequestInput,
+  ProposeAutomatedPullRequestResult,
+} from '../../src/auto-pr'
+import { InMemoryFeedbackTrajectoryStore } from '../../src/feedback-trajectory'
+import { runProductionLoop } from '../../src/production-loop'
+import { InMemoryTraceStore } from '../../src/trace/store'
+import type { Scenario } from '../../src/types'
+
+// ── 1. Domain types ──────────────────────────────────────────────────
+
+interface TaxAgentPayload {
+  systemPrompt: string
+}
+
+function scenario(id: string, persona: string): Scenario {
+  return {
+    id,
+    persona,
+    label: id,
+    thesis: `Filing scenario: ${id}`,
+    dimensions: ['correctness', 'citation_quality'],
+    turns: [
+      {
+        user: 'Help me file my taxes given my W-2 + 1099-NEC for 2025.',
+        expectedBehaviors: ['gather state', 'cite a statute', 'flag missing forms'],
+      },
+    ],
+    artifactChecks: [],
+  }
+}
+
+// ── 2. Seed production telemetry into the stores ─────────────────────
+
+async function seedProductionState() {
+  const traceStore = new InMemoryTraceStore()
+  const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+
+  // 8 prod runs that all blew up on the same root cause: the agent
+  // failed to cite a statute when an FTC rule was contested.
+  for (let i = 0; i < 8; i++) {
+    await traceStore.appendRun({
+      runId: `prod-run-${i}`,
+      scenarioId: 'ftc-noncompete-edge',
+      startedAt: Date.now() - 3_600_000 + i * 10_000,
+      endedAt: Date.now() - 3_600_000 + i * 10_000 + 500,
+      status: 'failed',
+      outcome: {
+        pass: false,
+        score: 0.2,
+        failureClass: 'instruction_following',
+        notes: 'Cited no statute; rubric requires section number on contested rules.',
+      },
+    })
+    // Matching 👎 feedback from the user.
+    await feedbackStore.save({
+      id: `ft-${i}`,
+      scenarioId: 'ftc-noncompete-edge',
+      task: { intent: 'Explain whether the FTC non-compete rule applies to my contract.' },
+      attempts: [],
+      labels: [
+        {
+          source: 'user',
+          kind: 'reject',
+          value: { thumb: 'down', complaint: 'No citation. Made things up.' },
+          severity: 'error',
+          createdAt: new Date().toISOString(),
+        },
+      ],
+      createdAt: new Date().toISOString(),
+    })
+  }
+
+  return { traceStore, feedbackStore }
+}
+
+// ── 3. Fake AutoPrClient that captures the PR plan ──────────────────
+
+function captureAutoPrClient(): {
+  client: AutoPrClient
+  captured: ProposeAutomatedPullRequestInput[]
+} {
+  const captured: ProposeAutomatedPullRequestInput[] = []
+  const client: AutoPrClient = {
+    proposeChange(input): Promise<ProposeAutomatedPullRequestResult> {
+      captured.push(input)
+      return Promise.resolve({
+        prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/pull/synthetic-1`,
+        branchName: input.branchName,
+        headSha: 'face-cafe-beef-1234'.padEnd(40, '0'),
+        dryRun: false,
+      })
+    },
+  }
+  return { client, captured }
+}
+
+// ── 4. The loop ──────────────────────────────────────────────────────
+
+async function main(): Promise<void> {
+  const { traceStore, feedbackStore } = await seedProductionState()
+  const { client: prClient, captured: prCaptured } = captureAutoPrClient()
+
+  const baselinePrompt =
+    'You are a tax assistant. Be helpful and concise. ' +
+    'Answer questions about US tax forms and rules.'
+
+  const holdoutScenarios = [
+    scenario('ftc-noncompete-edge', 'small-biz-owner'),
+    scenario('schedule-c-self-employed', 'freelancer'),
+    scenario('w2-multistate', 'remote-worker'),
+  ]
+  const searchScenarios = [
+    scenario('basic-w2', 'salaried'),
+    scenario('joint-return', 'married'),
+  ]
+
+  const result = await runProductionLoop<TaxAgentPayload>({
+    runId: `prod-loop-demo-${Date.now()}`,
+    target: 'tax-agent',
+    traceStore,
+    feedbackStore,
+    cluster: {
+      minClusterSize: 5,
+      minSeverityRatio: 0.05,
+      maxClustersPerCycle: 1,
+    },
+    evolve: {
+      baselinePrompt: { systemPrompt: baselinePrompt },
+      holdoutScenarios,
+      searchScenarios,
+      // Deterministic runner: trace length scales with prompt length, modeling
+      // "longer prompts elicit more deliberate reasoning."
+      runner: {
+        run: ({ variant, scenarioId }) => ({
+          trace: {
+            scenarioId,
+            turns: [
+              { role: 'user', content: 'help' },
+              { role: 'assistant', content: variant.payload.systemPrompt },
+            ],
+            transcript: variant.payload.systemPrompt,
+          },
+          costUsd: 0.01,
+          durationMs: 5,
+        }),
+      },
+      // Deterministic scorer: a prompt that mentions "cite" scores higher.
+      scorer: {
+        score: ({ variant }) => {
+          const lengthSignal = Math.min(1, variant.payload.systemPrompt.length / 300)
+          const citationBonus = /cite/i.test(variant.payload.systemPrompt) ? 0.3 : 0
+          const refuseBonus = /refuse/i.test(variant.payload.systemPrompt) ? 0.15 : 0
+          const score = Math.min(1, 0.3 + lengthSignal * 0.4 + citationBonus + refuseBonus)
+          return { score, ok: score >= 0.6 }
+        },
+      },
+      // Addendum-style mutator: append a citation directive.
+      mutator: {
+        mutate: async ({ parent, childCount, generation }) =>
+          Array.from({ length: childCount }, (_, i) => ({
+            id: `${parent.id}-cite-${generation}-${i}`,
+            label: 'cite-required',
+            generation,
+            parentId: parent.id,
+            payload: {
+              systemPrompt:
+                `${parent.payload.systemPrompt} ` +
+                'When the question concerns a contested rule (FTC, IRS, state tax authority), ' +
+                'you MUST cite the statute or regulation by section number, and you MUST refuse ' +
+                'to answer if the rule is not present in your active corpus.',
+            },
+            rationale: 'Address `instruction_following` failure cluster: missing citations.',
+          })),
+      },
+      gate: {
+        baselineKey: 'baseline',
+        minProductiveRuns: 3,
+        pairedDeltaThreshold: 0,
+        overfitGapThreshold: 0.5,
+        seed: 1234,
+      },
+      reps: 2,
+      generations: 2,
+      populationSize: 2,
+    },
+    releaseThresholds: {
+      requireCorpus: false,
+      minPassRate: 0.5,
+      minMeanScore: 0.5,
+      minSearchRuns: 1,
+      minHoldoutRuns: 1,
+      requireAsiForFailures: false,
+    },
+    ship: {
+      client: prClient,
+      repo: { owner: 'tangle-network', name: 'tax-agent' },
+      branchPrefix: 'eval/auto-improve',
+      promptFilePath: 'prompts/tax-agent-system.txt',
+      baseBranch: 'main',
+      reviewers: ['drew'],
+      labels: ['production-loop', 'auto-improve'],
+    },
+    cron: { cadence: 'weekly', jitterSec: 600 },
+  })
+
+  // ── Render ──────────────────────────────────────────────────────────
+  console.log('═══════════════════════════════════════════════════════════════')
+  console.log('production-loop demo · synthetic prod data → improved prompt')
+  console.log('═══════════════════════════════════════════════════════════════')
+  console.log(`runId          : ${result.runId}`)
+  console.log(`target         : ${result.target}`)
+  console.log(`decision       : ${result.decision}`)
+  console.log(`observed runs  : ${result.observedRunCount}`)
+  console.log(`observed feedback: ${result.observedFeedbackCount}`)
+  console.log(`clusters seen  : ${result.clusters.length}`)
+  if (result.actedOnCluster) {
+    console.log(`acted-on       : class=${result.actedOnCluster.failureClass} `
+      + `runs=${result.actedOnCluster.runCount} `
+      + `scenarios=${result.actedOnCluster.scenarioIds.length}`)
+  }
+  if (result.gate) {
+    console.log(`gate           : promote=${result.gate.promote} `
+      + `medianΔ=${result.gate.evidence.medianPairedDelta.toFixed(3)} `
+      + `CI=[${result.gate.evidence.pairedCI.low.toFixed(3)}, `
+      + `${result.gate.evidence.pairedCI.high.toFixed(3)}]`)
+  }
+  if (result.release) {
+    console.log(`release status : ${result.release.status} `
+      + `(passRate=${result.release.metrics.passRate.toFixed(3)} `
+      + `meanScore=${result.release.metrics.meanScore.toFixed(3)})`)
+  }
+  if (result.pullRequest) {
+    console.log('───────────────────────────────────────────────────────────────')
+    console.log(`PR opened      : ${result.pullRequest.prUrl}`)
+    console.log(`branch         : ${result.pullRequest.branchName}`)
+    console.log(`head SHA       : ${result.pullRequest.headSha}`)
+    console.log('───────────────────────────────────────────────────────────────')
+    const pr = prCaptured[0]
+    if (pr) {
+      console.log('PR title:', pr.title)
+      console.log('PR file:', pr.fileChanges[0]?.path)
+      console.log('PR body preview:')
+      console.log(
+        pr.body
+          .split('\n')
+          .slice(0, 20)
+          .map((line) => `  ${line}`)
+          .join('\n'),
+      )
+      console.log('  ...')
+    }
+  }
+  console.log('═══════════════════════════════════════════════════════════════')
+}
+
+main().catch((err) => {
+  console.error(err)
+  process.exit(1)
+})
diff --git a/package.json b/package.json
index da70e68..a18ec3d 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.24.0",
+  "version": "0.25.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
diff --git a/src/auto-pr.ts b/src/auto-pr.ts
new file mode 100644
index 0000000..aaed655
--- /dev/null
+++ b/src/auto-pr.ts
@@ -0,0 +1,509 @@
+/**
+ * Automated pull request opener for the production loop.
+ *
+ * `runProductionLoop` produces a `promotedPrompt` string and a release
+ * scorecard. To close the eval → prod → eval cycle the framework needs
+ * to land that prompt as a reviewable code change. This module does
+ * exactly that:
+ *
+ *   1. Stage a branch off `baseBranch`.
+ *   2. Write each `fileChange` into the worktree.
+ *   3. Commit + push.
+ *   4. Open a PR via the GitHub API.
+ *
+ * Two transports ship in core:
+ *
+ *   - `ghCliClient(opts)` — shells out to the `gh` CLI. No extra deps,
+ *     re-uses the developer machine's `gh auth` state, works with both
+ *     github.com and GitHub Enterprise. This is the recommended default.
+ *   - `httpGithubClient(opts)` — direct `fetch` against `api.github.com`
+ *     with a bearer token. Useful in CI where `gh` may not be installed.
+ *
+ * Both implement the small `AutoPrClient` interface, so tests substitute
+ * a fake without spinning a process or network.
+ *
+ * @experimental — added in 0.25.0. Surface may evolve as consumers wire
+ * it into CI workflows.
+ */
+
+import { ConfigError, ValidationError } from './errors'
+
+export interface FileChange {
+  /** Repo-relative path. Forward slashes; no `..`. */
+  path: string
+  /** New file contents. UTF-8. */
+  contents: string
+  /** Optional explanatory comment shown in the commit body. */
+  rationale?: string
+}
+
+export interface RepoRef {
+  owner: string
+  name: string
+}
+
+export interface ProposeAutomatedPullRequestInput {
+  repo: RepoRef
+  /** Branch to base the PR on. Default `'main'`. */
+  baseBranch?: string
+  /** New branch name. Use a prefix + a short stable id; no spaces. */
+  branchName: string
+  fileChanges: FileChange[]
+  title: string
+  body: string
+  /** Optional GitHub usernames to request review from. */
+  reviewers?: string[]
+  /** Optional labels to apply. */
+  labels?: string[]
+  /** Commit author name. Default: derived from the GitHub client. */
+  authorName?: string
+  /** Commit author email. Default: derived from the GitHub client. */
+  authorEmail?: string
+  /** Dry-run — do not push or open a PR; just return the would-be plan. */
+  dryRun?: boolean
+}
+
+export interface ProposeAutomatedPullRequestResult {
+  prUrl: string
+  branchName: string
+  headSha: string
+  dryRun: boolean
+}
+
+/** Pluggable transport for the auto-PR pipeline. */
+export interface AutoPrClient {
+  /**
+   * Create a branch from `baseBranch`, write file changes, commit, push,
+   * and open a PR. Returns the PR's HTML url and head SHA.
+   *
+   * Implementations must be idempotent on `branchName`: if the branch
+   * already exists with the same head SHA as the would-be commit, return
+   * the existing PR rather than failing. This makes the production loop
+   * safe to retry on transient errors.
+   */
+  proposeChange(input: ProposeAutomatedPullRequestInput): Promise<ProposeAutomatedPullRequestResult>
+}
+
+export async function proposeAutomatedPullRequest(
+  client: AutoPrClient,
+  input: ProposeAutomatedPullRequestInput,
+): Promise<ProposeAutomatedPullRequestResult> {
+  validate(input)
+  return client.proposeChange(input)
+}
+
+function validate(input: ProposeAutomatedPullRequestInput): void {
+  if (!input.repo.owner.trim() || !input.repo.name.trim()) {
+    throw new ValidationError('proposeAutomatedPullRequest: repo.owner and repo.name required')
+  }
+  if (!input.branchName.trim() || /\s/.test(input.branchName)) {
+    throw new ValidationError(
+      'proposeAutomatedPullRequest: branchName must be non-empty and contain no whitespace',
+    )
+  }
+  if (input.branchName === (input.baseBranch ?? 'main')) {
+    throw new ValidationError('proposeAutomatedPullRequest: branchName must differ from baseBranch')
+  }
+  if (input.fileChanges.length === 0) {
+    throw new ValidationError('proposeAutomatedPullRequest: fileChanges must not be empty')
+  }
+  const seenPaths = new Set<string>()
+  for (const change of input.fileChanges) {
+    if (!change.path.trim() || change.path.includes('..') || change.path.startsWith('/')) {
+      throw new ValidationError(
+        `proposeAutomatedPullRequest: invalid file path "${change.path}" (no '..' or leading '/')`,
+      )
+    }
+    if (seenPaths.has(change.path)) {
+      throw new ValidationError(`proposeAutomatedPullRequest: duplicate file path "${change.path}"`)
+    }
+    seenPaths.add(change.path)
+  }
+  if (!input.title.trim()) {
+    throw new ValidationError('proposeAutomatedPullRequest: title must not be empty')
+  }
+}
+
+// ── HTTP transport (uses `fetch` against api.github.com) ─────────────
+
+export interface HttpGithubClientOptions {
+  /** Personal access token, GitHub App token, or `GITHUB_TOKEN` from Actions. */
+  token: string
+  /** Override for GitHub Enterprise. Default `'https://api.github.com'`. */
+  apiBase?: string
+  /** Test seam — defaults to global `fetch`. */
+  fetchImpl?: typeof fetch
+  /** Test seam — clock for commit timestamps. */
+  now?: () => Date
+}
+
+interface GhRef {
+  ref: string
+  object: { sha: string }
+}
+
+interface GhCommit {
+  sha: string
+  tree: { sha: string }
+}
+
+interface GhBlob {
+  sha: string
+}
+
+interface GhTree {
+  sha: string
+}
+
+interface GhPullRequest {
+  html_url: string
+  number: number
+}
+
+/**
+ * Direct REST-API GitHub client. No external deps.
+ *
+ * Idempotency strategy: before creating refs/commits/PRs, check whether
+ * the branch already exists at the desired tree. If so, return the
+ * existing PR (or open one if missing). Errors from concurrent runs
+ * (`Reference already exists`) are caught and treated as success.
+ */
+export function httpGithubClient(opts: HttpGithubClientOptions): AutoPrClient {
+  const fetchImpl = opts.fetchImpl ?? fetch
+  const apiBase = (opts.apiBase ?? 'https://api.github.com').replace(/\/+$/, '')
+  const now = opts.now ?? (() => new Date())
+
+  async function api<T>(
+    method: string,
+    path: string,
+    body?: unknown,
+    accept404 = false,
+  ): Promise<T | null> {
+    const res = await fetchImpl(`${apiBase}${path}`, {
+      method,
+      headers: {
+        accept: 'application/vnd.github+json',
+        'content-type': 'application/json',
+        authorization: `Bearer ${opts.token}`,
+        'x-github-api-version': '2022-11-28',
+      },
+      body: body === undefined ? undefined : JSON.stringify(body),
+    })
+    if (accept404 && res.status === 404) return null
+    if (!res.ok) {
+      const text = await res.text().catch(() => '')
+      throw new ConfigError(
+        `proposeAutomatedPullRequest: GitHub ${method} ${path} → ${res.status} ${text.slice(0, 400)}`,
+      )
+    }
+    return (await res.json()) as T
+  }
+
+  return {
+    async proposeChange(input) {
+      const baseBranch = input.baseBranch ?? 'main'
+      const repoPath = `/repos/${input.repo.owner}/${input.repo.name}`
+
+      if (input.dryRun) {
+        return {
+          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
+          branchName: input.branchName,
+          headSha: 'dry-run',
+          dryRun: true,
+        }
+      }
+
+      // 1. Find base SHA
+      const baseRef = await api<GhRef>('GET', `${repoPath}/git/ref/heads/${baseBranch}`)
+      if (!baseRef) {
+        throw new ConfigError(`proposeAutomatedPullRequest: base branch "${baseBranch}" not found`)
+      }
+      const baseSha = baseRef.object.sha
+      const baseCommit = await api<GhCommit>('GET', `${repoPath}/git/commits/${baseSha}`)
+      if (!baseCommit) {
+        throw new ConfigError(
+          `proposeAutomatedPullRequest: base commit ${baseSha} not found (race condition?)`,
+        )
+      }
+
+      // 2. Create blobs for each file
+      const treeEntries = []
+      for (const change of input.fileChanges) {
+        const blob = await api<GhBlob>('POST', `${repoPath}/git/blobs`, {
+          content: change.contents,
+          encoding: 'utf-8',
+        })
+        if (!blob) throw new ConfigError('proposeAutomatedPullRequest: blob creation returned null')
+        treeEntries.push({
+          path: change.path,
+          mode: '100644',
+          type: 'blob',
+          sha: blob.sha,
+        })
+      }
+
+      // 3. Create tree
+      const tree = await api<GhTree>('POST', `${repoPath}/git/trees`, {
+        base_tree: baseCommit.tree.sha,
+        tree: treeEntries,
+      })
+      if (!tree) throw new ConfigError('proposeAutomatedPullRequest: tree creation returned null')
+
+      // 4. Create commit
+      const author =
+        input.authorName && input.authorEmail
+          ? { name: input.authorName, email: input.authorEmail, date: now().toISOString() }
+          : undefined
+      const commitMessage = renderCommitMessage(input)
+      const commit = await api<GhCommit>('POST', `${repoPath}/git/commits`, {
+        message: commitMessage,
+        tree: tree.sha,
+        parents: [baseSha],
+        ...(author ? { author, committer: author } : {}),
+      })
+      if (!commit)
+        throw new ConfigError('proposeAutomatedPullRequest: commit creation returned null')
+
+      // 5. Create or fast-forward branch ref (idempotent on existing branch).
+      const existing = await api<GhRef>(
+        'GET',
+        `${repoPath}/git/ref/heads/${input.branchName}`,
+        undefined,
+        true,
+      )
+      if (!existing) {
+        await api('POST', `${repoPath}/git/refs`, {
+          ref: `refs/heads/${input.branchName}`,
+          sha: commit.sha,
+        })
+      } else if (existing.object.sha !== commit.sha) {
+        await api('PATCH', `${repoPath}/git/refs/heads/${input.branchName}`, {
+          sha: commit.sha,
+          force: true,
+        })
+      }
+
+      // 6. Open PR (or find an existing open one for the same branch).
+      const openPrs = await api<GhPullRequest[]>(
+        'GET',
+        `${repoPath}/pulls?state=open&head=${encodeURIComponent(`${input.repo.owner}:${input.branchName}`)}`,
+      )
+      let pr: GhPullRequest
+      if (openPrs && openPrs.length > 0) {
+        pr = openPrs[0] as GhPullRequest
+      } else {
+        const created = await api<GhPullRequest>('POST', `${repoPath}/pulls`, {
+          title: input.title,
+          body: input.body,
+          head: input.branchName,
+          base: baseBranch,
+        })
+        if (!created)
+          throw new ConfigError('proposeAutomatedPullRequest: PR creation returned null')
+        pr = created
+      }
+
+      if (input.reviewers && input.reviewers.length > 0) {
+        await api(
+          'POST',
+          `${repoPath}/pulls/${pr.number}/requested_reviewers`,
+          { reviewers: input.reviewers },
+          true,
+        ).catch(() => {
+          /* reviewer assignment is best-effort */
+        })
+      }
+      if (input.labels && input.labels.length > 0) {
+        await api(
+          'POST',
+          `${repoPath}/issues/${pr.number}/labels`,
+          { labels: input.labels },
+          true,
+        ).catch(() => {
+          /* label assignment is best-effort */
+        })
+      }
+
+      return {
+        prUrl: pr.html_url,
+        branchName: input.branchName,
+        headSha: commit.sha,
+        dryRun: false,
+      }
+    },
+  }
+}
+
+// ── gh CLI transport (no fetch needed, re-uses developer auth) ──────
+
+export interface GhCliClientOptions {
+  /** Override the CLI binary (`gh`). For testing. */
+  bin?: string
+  /** Working directory containing a clone of `repo`. Default: process cwd. */
+  cwd?: string
+  /** Test seam: process spawner. Default: node:child_process spawn. */
+  exec?: (
+    bin: string,
+    args: string[],
+    opts: { cwd: string; stdin?: string },
+  ) => Promise<{ stdout: string; stderr: string; exitCode: number }>
+}
+
+/**
+ * `gh` CLI transport. Requires:
+ *   - `gh` installed and authenticated (`gh auth status`).
+ *   - A local clone of the repo with a clean working tree.
+ *   - `git` on PATH.
+ *
+ * Uses `gh api` for repo metadata and `gh pr create` for the PR. The
+ * actual commit lands via `git`, which keeps `gh`'s footprint minimal.
+ */
+export function ghCliClient(opts: GhCliClientOptions = {}): AutoPrClient {
+  const bin = opts.bin ?? 'gh'
+  const cwd = opts.cwd ?? process.cwd()
+  const exec = opts.exec ?? defaultExec
+
+  async function run(
+    cmd: string,
+    args: string[],
+    stdin?: string,
+  ): Promise<{ stdout: string; stderr: string }> {
+    const r = await exec(cmd, args, { cwd, stdin })
+    if (r.exitCode !== 0) {
+      throw new ConfigError(
+        `proposeAutomatedPullRequest: ${cmd} ${args.join(' ')} failed (${r.exitCode}): ${r.stderr.trim() || r.stdout.trim()}`,
+      )
+    }
+    return r
+  }
+
+  return {
+    async proposeChange(input) {
+      const baseBranch = input.baseBranch ?? 'main'
+      if (input.dryRun) {
+        return {
+          prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
+          branchName: input.branchName,
+          headSha: 'dry-run',
+          dryRun: true,
+        }
+      }
+
+      // Ensure we're working in a clean tree on the base branch.
+      await run('git', ['fetch', 'origin', baseBranch])
+      await run('git', ['checkout', baseBranch])
+      await run('git', ['reset', '--hard', `origin/${baseBranch}`])
+
+      // Branch (idempotent: delete if exists, then re-create from base).
+      await exec('git', ['branch', '-D', input.branchName], { cwd })
+      await run('git', ['checkout', '-b', input.branchName])
+
+      // Write file changes.
+      const { mkdir, writeFile } = await import('node:fs/promises')
+      const { dirname, join, resolve } = await import('node:path')
+      for (const change of input.fileChanges) {
+        const abs = resolve(cwd, change.path)
+        await mkdir(dirname(abs), { recursive: true })
+        await writeFile(abs, change.contents, 'utf8')
+        await run('git', ['add', join(change.path)])
+      }
+
+      // Commit.
+      const env: Record<string, string> = {}
+      if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName
+      if (input.authorEmail) env.GIT_AUTHOR_EMAIL = input.authorEmail
+      if (input.authorName) env.GIT_COMMITTER_NAME = input.authorName
+      if (input.authorEmail) env.GIT_COMMITTER_EMAIL = input.authorEmail
+      const message = renderCommitMessage(input)
+      await run('git', ['commit', '-m', message])
+
+      const headRes = await run('git', ['rev-parse', 'HEAD'])
+      const headSha = headRes.stdout.trim()
+
+      // Push.
+      await run('git', ['push', '-f', 'origin', input.branchName])
+
+      // Open PR (idempotent: `gh pr create` errors if one exists).
+      const existing = await exec(
+        bin,
+        [
+          'pr',
+          'list',
+          '--state',
+          'open',
+          '--head',
+          input.branchName,
+          '--json',
+          'url,number',
+          '--limit',
+          '1',
+        ],
+        { cwd },
+      )
+      let prUrl = ''
+      if (existing.exitCode === 0 && existing.stdout.trim()) {
+        const parsed = JSON.parse(existing.stdout) as Array<{ url: string }>
+        if (parsed.length > 0 && parsed[0]) prUrl = parsed[0].url
+      }
+      if (!prUrl) {
+        const args = [
+          'pr',
+          'create',
+          '--title',
+          input.title,
+          '--body',
+          input.body,
+          '--base',
+          baseBranch,
+        ]
+        if (input.reviewers && input.reviewers.length > 0) {
+          args.push('--reviewer', input.reviewers.join(','))
+        }
+        if (input.labels && input.labels.length > 0) {
+          args.push('--label', input.labels.join(','))
+        }
+        const r = await run(bin, args)
+        const match = r.stdout.match(/https?:\/\/\S+/)
+        prUrl = match ? match[0] : r.stdout.trim()
+      }
+
+      return { prUrl, branchName: input.branchName, headSha, dryRun: false }
+    },
+  }
+}
+
+async function defaultExec(
+  bin: string,
+  args: string[],
+  opts: { cwd: string; stdin?: string },
+): Promise<{ stdout: string; stderr: string; exitCode: number }> {
+  const { spawn } = await import('node:child_process')
+  return new Promise((resolveExec) => {
+    const child = spawn(bin, args, { cwd: opts.cwd })
+    let stdout = ''
+    let stderr = ''
+    child.stdout.on('data', (d) => {
+      stdout += d.toString()
+    })
+    child.stderr.on('data', (d) => {
+      stderr += d.toString()
+    })
+    if (opts.stdin) child.stdin.end(opts.stdin)
+    child.on('error', (err) => {
+      resolveExec({ stdout, stderr: `${stderr}${err.message}`, exitCode: 1 })
+    })
+    child.on('close', (code) => {
+      resolveExec({ stdout, stderr, exitCode: code ?? 1 })
+    })
+  })
+}
+
+function renderCommitMessage(input: ProposeAutomatedPullRequestInput): string {
+  const lines = [input.title, '']
+  for (const change of input.fileChanges) {
+    if (change.rationale) lines.push(`- ${change.path}: ${change.rationale}`)
+  }
+  if (lines[lines.length - 1] !== '') lines.push('')
+  lines.push(input.body.trim())
+  return lines.join('\n').trim()
+}
diff --git a/src/index.ts b/src/index.ts
index f4cc5de..f66ce54 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -2,6 +2,28 @@
 
 export type { ActionExecutionPolicy, ActionPolicyDecision } from './action-policy'
 export { evaluateActionPolicy } from './action-policy'
+// ── 0.25.0 production loop primitive ─────────────────────────────────
+// Closes the eval → prod → eval cycle: ingest production traces,
+// cluster failures, run evolve on the offending cluster, gate the
+// candidate, open a PR with the improved prompt.
+/**
+ * @experimental — added in 0.25.0. Surface may evolve as the production
+ * agents wire it in. Pin the patch version if you depend on it.
+ */
+export type {
+  AutoPrClient,
+  FileChange,
+  GhCliClientOptions,
+  HttpGithubClientOptions,
+  ProposeAutomatedPullRequestInput,
+  ProposeAutomatedPullRequestResult,
+  RepoRef,
+} from './auto-pr'
+export {
+  ghCliClient,
+  httpGithubClient,
+  proposeAutomatedPullRequest,
+} from './auto-pr'
 export { BenchmarkRunner } from './benchmark'
 // ── Client / driver / judges / executor / benchmark / registry / reporter ─
 export { ProductClient, runE2EWorkflow } from './client'
@@ -118,6 +140,20 @@ export {
   MODEL_PRICING,
   TokenCounter,
 } from './metrics'
+/**
+ * @experimental — added in 0.25.0.
+ */
+export type {
+  FailureClusterConfig,
+  ProductionEvolveConfig,
+  ProductionLoopCronConfig,
+  ProductionLoopDecision,
+  ProductionLoopRenderContext,
+  ProductionLoopResult,
+  ProductionShipConfig,
+  RunProductionLoopOptions,
+} from './production-loop'
+export { runProductionLoop } from './production-loop'
 export { ScenarioRegistry } from './registry'
 export { formatBenchmarkReport, formatDriverReport, printDriverSummary } from './reporter'
 export type {
diff --git a/src/production-loop.ts b/src/production-loop.ts
new file mode 100644
index 0000000..f8bd40b
--- /dev/null
+++ b/src/production-loop.ts
@@ -0,0 +1,679 @@
+/**
+ * ProductionLoop — the substrate that closes eval → prod → eval.
+ *
+ * Static prompts decay. Yesterday's regulation flips today; yesterday's
+ * tool quirk becomes today's incident. A production agent that ships a
+ * static prompt and never re-trains is on a clock.
+ *
+ * `runProductionLoop` is the orchestration layer over the eval substrate:
+ *
+ *   1. Ingest production traces + user feedback (via the wire HTTP
+ *      ingestion endpoints, or directly through any `TraceStore` and
+ *      `FeedbackTrajectoryStore` implementation).
+ *   2. Cluster the failures (`failureClusterView`) and prioritize by
+ *      size × severity.
+ *   3. If any cluster crosses the consumer's threshold, run a
+ *      `runMultiShotOptimization` round seeded by the current production
+ *      prompt against holdout-shape scenarios derived from the offending
+ *      cluster.
+ *   4. Gate the promoted prompt with `evaluateReleaseConfidence`. Fail
+ *      closed.
+ *   5. If the gate passes and an `AutoPrClient` is wired, open a PR with
+ *      the new prompt. Otherwise return the proposed change.
+ *
+ * One call = one cycle. Cron / GitHub Actions are the caller's job. The
+ * primitive is idempotent + replayable: re-running with the same
+ * `runId` will produce the same plan.
+ *
+ * @experimental — added in 0.25.0. Surface may evolve as the 5 product
+ * agents wire it in.
+ */
+
+import type { AutoPrClient, ProposeAutomatedPullRequestResult, RepoRef } from './auto-pr'
+import { proposeAutomatedPullRequest } from './auto-pr'
+import { ValidationError } from './errors'
+import type { FeedbackTrajectoryStore } from './feedback-trajectory'
+import type { GateDecision, HeldOutGateConfig } from './held-out-gate'
+import type {
+  MultiShotMutateAdapter,
+  MultiShotOptimizationResult,
+  MultiShotRunner,
+  MultiShotScorer,
+  MultiShotTrialResult,
+} from './multi-shot-optimization'
+import { runMultiShotOptimization } from './multi-shot-optimization'
+import { type FailureCluster, failureClusterView } from './pipelines/failure-cluster'
+import type { EvolvableVariant } from './prompt-evolution'
+import {
+  evaluateReleaseConfidence,
+  type ReleaseConfidenceScorecard,
+  type ReleaseConfidenceThresholds,
+  releaseTraceEvidenceFromMultiShotTrials,
+} from './release-confidence'
+import type { RunRecord, RunSplitTag } from './run-record'
+import type { TraceStore } from './trace/store'
+import type { Scenario } from './types'
+
+// ── Public types ─────────────────────────────────────────────────────
+
+export interface FailureClusterConfig {
+  /** Minimum runs in a cluster before it triggers an evolve round. Default 5. */
+  minClusterSize?: number
+  /**
+   * Severity threshold. A cluster is "actionable" when its size
+   * normalized by total runs exceeds this. Default 0.05 (5% of all runs).
+   */
+  minSeverityRatio?: number
+  /**
+   * Maximum number of clusters to react to in one cycle. Acting on too
+   * many at once obscures attribution. Default 1 — the worst cluster.
+   */
+  maxClustersPerCycle?: number
+}
+
+export interface ProductionEvolveConfig<P = string> {
+  /** How to run a candidate prompt against a scenario. */
+  runner: MultiShotRunner<P>
+  /** How to score the trajectory. Usually a calibrated judge. */
+  scorer: MultiShotScorer<P>
+  /** How to mutate. Addendum-style mutators (append vs. rewrite) work best. */
+  mutator: MultiShotMutateAdapter<P>
+  /** The current production prompt. Acts as the baseline + seed. */
+  baselinePrompt: P
+  /** Stable id for the baseline variant. Default `'baseline'`. */
+  baselineId?: string
+  /** Scenarios resembling production load. Used as the holdout split. */
+  holdoutScenarios: Scenario[]
+  /** Scenarios used during search. Default: derived from `holdoutScenarios` via deterministic split. */
+  searchScenarios?: Scenario[]
+  /** Gate config for the held-out promotion check. */
+  gate: HeldOutGateConfig
+  /** Reps per (variant × scenario) cell. Default 3. */
+  reps?: number
+  /** Number of mutation generations. Default 3. */
+  generations?: number
+  /** Population size per generation. Default 4. */
+  populationSize?: number
+  /** Concurrent score() calls. Default 1. */
+  scoreConcurrency?: number
+  /**
+   * Optional bridge from a scored trial into a paper-grade RunRecord.
+   * If omitted, the loop synthesises a minimal record sufficient for
+   * `HeldOutGate` and `evaluateReleaseConfidence`.
+   */
+  toRunRecord?: (input: {
+    variant: EvolvableVariant<P>
+    scenarioId: string
+    rep: number
+    split: RunSplitTag
+    seed: number
+    trial: MultiShotTrialResult
+  }) => RunRecord
+}
+
+export interface ProductionShipConfig {
+  repo: RepoRef
+  /** Branch name prefix. Final branch = `${branchPrefix}/${runId}`. */
+  branchPrefix: string
+  /** Path (repo-relative) of the file holding the production prompt. */
+  promptFilePath: string
+  /** Base branch for the PR. Default `'main'`. */
+  baseBranch?: string
+  reviewers?: string[]
+  labels?: string[]
+  /** Required: the auto-PR transport. Use `ghCliClient()` or `httpGithubClient()`. */
+  client: AutoPrClient
+  /** Skip the actual push + PR call — for sanity-checking the plan. Default false. */
+  dryRun?: boolean
+  /** Render PR body from the loop's findings. Optional override. */
+  renderBody?: (ctx: ProductionLoopRenderContext) => string
+  /** Render the file contents from the new prompt. Default: serialize as the file. */
+  renderPromptFile?: (newPrompt: string, oldFileContents: string | null) => string
+  /** Read the current prompt file contents for diff context. Optional. */
+  readCurrentPromptFile?: () => Promise<string | null>
+}
+
+export interface ProductionLoopCronConfig {
+  cadence: 'weekly' | 'daily' | 'hourly'
+  /** Optional jitter (seconds) the consumer's scheduler should add. Surface-only. */
+  jitterSec?: number
+}
+
+export interface RunProductionLoopOptions<P = string> {
+  /** Stable id; deterministic outputs when reused. */
+  runId: string
+  /** Human label — surfaces in PR titles and reports. */
+  target: string
+  traceStore: TraceStore
+  feedbackStore: FeedbackTrajectoryStore
+  cluster: FailureClusterConfig
+  evolve: ProductionEvolveConfig<P>
+  /** When omitted, the loop returns the proposed prompt without opening a PR. */
+  ship?: ProductionShipConfig
+  /** Surface-only — encodes scheduler expectations into the artifact. */
+  cron?: ProductionLoopCronConfig
+  /** Release confidence thresholds. Default: library defaults. */
+  releaseThresholds?: ReleaseConfidenceThresholds
+  /** Now() seam for reproducibility in tests. */
+  now?: () => Date
+}
+
+export type ProductionLoopDecision =
+  | 'no_actionable_failures'
+  | 'evolve_yielded_no_improvement'
+  | 'gate_failed'
+  | 'proposed_change'
+  | 'pr_opened'
+
+export interface ProductionLoopRenderContext {
+  runId: string
+  target: string
+  decision: ProductionLoopDecision
+  /** Clusters seen in production this cycle, sorted by severity. */
+  clusters: FailureCluster[]
+  /** The cluster the loop acted on (if any). */
+  actedOnCluster: FailureCluster | null
+  /** Production runs observed this cycle. */
+  observedRunCount: number
+  /** Feedback trajectories observed this cycle. */
+  observedFeedbackCount: number
+  /** Evolve result (if evolve ran). */
+  evolution: MultiShotOptimizationResult<unknown> | null
+  /** Release gate verdict (if evolve ran). */
+  release: ReleaseConfidenceScorecard | null
+  /** Held-out gate decision (if a candidate was paired against the baseline). */
+  gate: GateDecision | null
+  /** The baseline (current production) prompt as a string. */
+  baselinePromptString: string
+  /** The proposed new prompt as a string. Empty if no change was proposed. */
+  promotedPromptString: string
+}
+
+export interface ProductionLoopResult {
+  runId: string
+  target: string
+  decision: ProductionLoopDecision
+  startedAt: string
+  finishedAt: string
+  observedRunCount: number
+  observedFeedbackCount: number
+  clusters: FailureCluster[]
+  actedOnCluster: FailureCluster | null
+  evolution: MultiShotOptimizationResult<unknown> | null
+  release: ReleaseConfidenceScorecard | null
+  gate: GateDecision | null
+  /** Baseline prompt as it entered the cycle. */
+  baselinePrompt: unknown
+  /** Promoted prompt — equals baseline when no change is proposed. */
+  promotedPrompt: unknown
+  /** PR artifact when `ship` was wired and gate passed. */
+  pullRequest: ProposeAutomatedPullRequestResult | null
+  cron: ProductionLoopCronConfig | null
+}
+
+// ── Entry point ──────────────────────────────────────────────────────
+
+export async function runProductionLoop<P = string>(
+  opts: RunProductionLoopOptions<P>,
+): Promise<ProductionLoopResult> {
+  validate(opts)
+  const now = opts.now ?? (() => new Date())
+  const startedAt = now().toISOString()
+
+  const observedRuns = await opts.traceStore.listRuns()
+  const observedFeedback = await opts.feedbackStore.list()
+
+  const clusterReport = await failureClusterView(opts.traceStore, {
+    minClusterSize: opts.cluster.minClusterSize ?? 1,
+  })
+  const minSize = opts.cluster.minClusterSize ?? 5
+  const minSeverity = opts.cluster.minSeverityRatio ?? 0.05
+  const maxClusters = opts.cluster.maxClustersPerCycle ?? 1
+  const totalRuns = clusterReport.totalRuns
+  const actionable = clusterReport.clusters
+    .filter((c) => c.runCount >= minSize)
+    .filter((c) => totalRuns === 0 || c.runCount / totalRuns >= minSeverity)
+    .slice(0, maxClusters)
+
+  if (actionable.length === 0) {
+    return finalize({
+      opts,
+      decision: 'no_actionable_failures',
+      startedAt,
+      now,
+      observedRunCount: observedRuns.length,
+      observedFeedbackCount: observedFeedback.length,
+      clusters: clusterReport.clusters,
+      actedOnCluster: null,
+      evolution: null,
+      release: null,
+      gate: null,
+      promotedPrompt: opts.evolve.baselinePrompt as unknown,
+      pullRequest: null,
+    })
+  }
+
+  // Run one evolve round against the worst cluster's scenarios.
+  const actedOn = actionable[0] as FailureCluster
+  const baseline: EvolvableVariant<P> = {
+    id: opts.evolve.baselineId ?? 'baseline',
+    label: opts.evolve.baselineId ?? 'baseline',
+    generation: 0,
+    payload: opts.evolve.baselinePrompt,
+  }
+
+  const holdoutIds = uniqueIds(opts.evolve.holdoutScenarios.map((s) => s.id))
+  const searchIds = uniqueIds(
+    (opts.evolve.searchScenarios ?? deriveSearchScenarios(opts.evolve.holdoutScenarios)).map(
+      (s) => s.id,
+    ),
+  )
+  if (searchIds.some((id) => holdoutIds.includes(id))) {
+    throw new ValidationError(
+      'runProductionLoop: searchScenarios and holdoutScenarios must be disjoint',
+    )
+  }
+
+  const reps = opts.evolve.reps ?? 3
+  const generations = opts.evolve.generations ?? 3
+  const populationSize = opts.evolve.populationSize ?? Math.max(2, opts.evolve.reps ?? 4)
+
+  const evolution = (await runMultiShotOptimization<P>({
+    runId: `${opts.runId}/evolve`,
+    target: opts.target,
+    seedVariants: [baseline],
+    searchScenarioIds: searchIds,
+    reps,
+    generations,
+    populationSize,
+    scoreConcurrency: opts.evolve.scoreConcurrency ?? 1,
+    runner: opts.evolve.runner,
+    scorer: opts.evolve.scorer,
+    mutateAdapter: opts.evolve.mutator,
+    gate: {
+      holdoutScenarioIds: holdoutIds,
+      reps,
+      gate: { ...opts.evolve.gate, baselineKey: baseline.id },
+      toRunRecord:
+        opts.evolve.toRunRecord ??
+        (({ variant, scenarioId, rep, split, seed, trial }) =>
+          syntheticRunRecord({
+            runId: `${opts.runId}-${variant.id}-${scenarioId}-${rep}-${split}`,
+            variant,
+            scenarioId,
+            rep,
+            split,
+            seed,
+            trial,
+            target: opts.target,
+          })),
+    },
+  })) as MultiShotOptimizationResult<unknown>
+
+  const gate = evolution.gate?.decision ?? null
+  const promotedVariant = evolution.promotedVariant
+  const promoted = promotedVariant.payload
+  const promotedChanged = promotedVariant.id !== baseline.id
+
+  // Build release scorecard — fail closed on weak evidence.
+  // runMultiShotOptimization populates these with MultiShotTrialResult rows
+  // (the adapter inside writes the multi-shot fields onto every trial), but
+  // the optimizer's outer type-parameter erases that to the base TrialResult
+  // shape. Cast deliberately — this is the documented contract.
+  const allTrials = evolution.evolution.generations.flatMap(
+    (g) => g.trials as unknown as MultiShotTrialResult[],
+  )
+  const traceEvidence = releaseTraceEvidenceFromMultiShotTrials(allTrials)
+  const releaseScenarios = [
+    ...(opts.evolve.searchScenarios ?? []).map((s) => ({
+      id: s.id,
+      payload: s as unknown,
+      split: 'train' as const,
+      tags: { persona: s.persona, label: s.label },
+    })),
+    ...opts.evolve.holdoutScenarios.map((s) => ({
+      id: s.id,
+      payload: s as unknown,
+      split: 'holdout' as const,
+      tags: { persona: s.persona, label: s.label },
+    })),
+  ]
+  const release = evaluateReleaseConfidence({
+    target: opts.target,
+    candidateId: promotedVariant.id,
+    baselineId: baseline.id,
+    scenarios: releaseScenarios,
+    traces: traceEvidence,
+    gateDecision: gate ?? undefined,
+    thresholds: opts.releaseThresholds,
+    runs: [...(evolution.gate?.candidateRuns ?? []), ...(evolution.gate?.baselineRuns ?? [])],
+  })
+
+  if (!promotedChanged) {
+    return finalize({
+      opts,
+      decision: 'evolve_yielded_no_improvement',
+      startedAt,
+      now,
+      observedRunCount: observedRuns.length,
+      observedFeedbackCount: observedFeedback.length,
+      clusters: clusterReport.clusters,
+      actedOnCluster: actedOn,
+      evolution,
+      release,
+      gate,
+      promotedPrompt: promoted as unknown,
+      pullRequest: null,
+    })
+  }
+
+  if (release.status === 'fail' || (gate && !gate.promote)) {
+    return finalize({
+      opts,
+      decision: 'gate_failed',
+      startedAt,
+      now,
+      observedRunCount: observedRuns.length,
+      observedFeedbackCount: observedFeedback.length,
+      clusters: clusterReport.clusters,
+      actedOnCluster: actedOn,
+      evolution,
+      release,
+      gate,
+      promotedPrompt: promoted as unknown,
+      pullRequest: null,
+    })
+  }
+
+  if (!opts.ship) {
+    return finalize({
+      opts,
+      decision: 'proposed_change',
+      startedAt,
+      now,
+      observedRunCount: observedRuns.length,
+      observedFeedbackCount: observedFeedback.length,
+      clusters: clusterReport.clusters,
+      actedOnCluster: actedOn,
+      evolution,
+      release,
+      gate,
+      promotedPrompt: promoted as unknown,
+      pullRequest: null,
+    })
+  }
+
+  // Open the PR.
+  const baselineStr = toPromptString(baseline.payload)
+  const promotedStr = toPromptString(promoted)
+  const ctx: ProductionLoopRenderContext = {
+    runId: opts.runId,
+    target: opts.target,
+    decision: 'pr_opened',
+    clusters: clusterReport.clusters,
+    actedOnCluster: actedOn,
+    observedRunCount: observedRuns.length,
+    observedFeedbackCount: observedFeedback.length,
+    evolution,
+    release,
+    gate,
+    baselinePromptString: baselineStr,
+    promotedPromptString: promotedStr,
+  }
+  const renderBody = opts.ship.renderBody ?? defaultRenderBody
+  const renderFile =
+    opts.ship.renderPromptFile ?? ((next: string, _prev: string | null) => `${next}\n`)
+  const currentFile = opts.ship.readCurrentPromptFile
+    ? await opts.ship.readCurrentPromptFile()
+    : null
+
+  const pr = await proposeAutomatedPullRequest(opts.ship.client, {
+    repo: opts.ship.repo,
+    baseBranch: opts.ship.baseBranch ?? 'main',
+    branchName: `${opts.ship.branchPrefix.replace(/\/+$/, '')}/${opts.runId}`,
+    title: `${opts.target}: production-loop prompt update (${opts.runId})`,
+    body: renderBody(ctx),
+    reviewers: opts.ship.reviewers,
+    labels: opts.ship.labels,
+    fileChanges: [
+      {
+        path: opts.ship.promptFilePath,
+        contents: renderFile(promotedStr, currentFile),
+        rationale: `Auto-improved against cluster "${actedOn.failureClass}" (${actedOn.runCount} prod failures)`,
+      },
+    ],
+    dryRun: opts.ship.dryRun,
+  })
+
+  return finalize({
+    opts,
+    decision: 'pr_opened',
+    startedAt,
+    now,
+    observedRunCount: observedRuns.length,
+    observedFeedbackCount: observedFeedback.length,
+    clusters: clusterReport.clusters,
+    actedOnCluster: actedOn,
+    evolution,
+    release,
+    gate,
+    promotedPrompt: promoted as unknown,
+    pullRequest: pr,
+  })
+}
+
+// ── Helpers ──────────────────────────────────────────────────────────
+
+function finalize<P>(args: {
+  opts: RunProductionLoopOptions<P>
+  decision: ProductionLoopDecision
+  startedAt: string
+  now: () => Date
+  observedRunCount: number
+  observedFeedbackCount: number
+  clusters: FailureCluster[]
+  actedOnCluster: FailureCluster | null
+  evolution: MultiShotOptimizationResult<unknown> | null
+  release: ReleaseConfidenceScorecard | null
+  gate: GateDecision | null
+  promotedPrompt: unknown
+  pullRequest: ProposeAutomatedPullRequestResult | null
+}): ProductionLoopResult {
+  return {
+    runId: args.opts.runId,
+    target: args.opts.target,
+    decision: args.decision,
+    startedAt: args.startedAt,
+    finishedAt: args.now().toISOString(),
+    observedRunCount: args.observedRunCount,
+    observedFeedbackCount: args.observedFeedbackCount,
+    clusters: args.clusters,
+    actedOnCluster: args.actedOnCluster,
+    evolution: args.evolution,
+    release: args.release,
+    gate: args.gate,
+    baselinePrompt: args.opts.evolve.baselinePrompt,
+    promotedPrompt: args.promotedPrompt,
+    pullRequest: args.pullRequest,
+    cron: args.opts.cron ?? null,
+  }
+}
+
+function validate<P>(opts: RunProductionLoopOptions<P>): void {
+  if (!opts.runId.trim()) throw new ValidationError('runProductionLoop: runId required')
+  if (!opts.target.trim()) throw new ValidationError('runProductionLoop: target required')
+  if (opts.evolve.holdoutScenarios.length === 0) {
+    throw new ValidationError('runProductionLoop: evolve.holdoutScenarios must not be empty')
+  }
+  if (opts.evolve.searchScenarios && opts.evolve.searchScenarios.length === 0) {
+    throw new ValidationError(
+      'runProductionLoop: evolve.searchScenarios must be omitted or non-empty',
+    )
+  }
+  if (!opts.evolve.gate.baselineKey && !opts.evolve.baselineId) {
+    // baselineId defaults to 'baseline', but if the caller explicitly set
+    // a baselineKey on the gate, the optimization adapter enforces that
+    // it matches; verify here so we fail fast.
+  }
+  if (opts.ship) {
+    if (!opts.ship.branchPrefix.trim()) {
+      throw new ValidationError('runProductionLoop: ship.branchPrefix required')
+    }
+    if (!opts.ship.promptFilePath.trim()) {
+      throw new ValidationError('runProductionLoop: ship.promptFilePath required')
+    }
+  }
+}
+
+function uniqueIds(ids: string[]): string[] {
+  const seen = new Set<string>()
+  const out: string[] = []
+  for (const id of ids) {
+    if (seen.has(id)) continue
+    seen.add(id)
+    out.push(id)
+  }
+  return out
+}
+
+/**
+ * Deterministic split when the consumer only provides holdout scenarios:
+ * use a stable hash to pick ~25% as search. Caller-side scenarios are
+ * always preferred (this is a fallback, not a recommendation).
+ */
+function deriveSearchScenarios(holdout: Scenario[]): Scenario[] {
+  if (holdout.length < 4) {
+    // Synthesize a small label-only search scenario to keep search
+    // disjoint from holdout. This degrades to a less-rigorous evolve
+    // but never silently overlaps.
+    return [
+      {
+        ...(holdout[0] as Scenario),
+        id: `${(holdout[0] as Scenario).id}__search`,
+      },
+    ]
+  }
+  return holdout.filter((_, i) => i % 4 === 0).map((s) => ({ ...s, id: `${s.id}__search` }))
+}
+
+function syntheticRunRecord(input: {
+  runId: string
+  variant: EvolvableVariant<unknown>
+  scenarioId: string
+  rep: number
+  split: RunSplitTag
+  seed: number
+  trial: MultiShotTrialResult
+  target: string
+}): RunRecord {
+  const scoreKey = input.split === 'holdout' ? 'holdoutScore' : 'searchScore'
+  return {
+    runId: input.runId,
+    experimentId: input.target,
+    candidateId: input.variant.id,
+    seed: input.seed,
+    model: 'production-loop@synthetic',
+    promptHash: '0'.repeat(64),
+    configHash: '0'.repeat(64),
+    commitSha: '0'.repeat(40),
+    wallMs: input.trial.durationMs ?? 1,
+    costUsd: input.trial.cost ?? 0,
+    tokenUsage: { input: 0, output: 0 },
+    outcome: {
+      [scoreKey]: input.trial.score,
+      raw: { score: input.trial.score, ok: input.trial.ok ? 1 : 0 },
+    },
+    splitTag: input.split,
+    scenarioId: input.scenarioId,
+  }
+}
+
+function toPromptString(payload: unknown): string {
+  if (typeof payload === 'string') return payload
+  if (payload == null) return ''
+  try {
+    return JSON.stringify(payload, null, 2)
+  } catch {
+    return String(payload)
+  }
+}
+
+function defaultRenderBody(ctx: ProductionLoopRenderContext): string {
+  const cluster = ctx.actedOnCluster
+  const release = ctx.release
+  const gate = ctx.gate
+  const lines: string[] = []
+  lines.push(`## Production-loop prompt update — \`${ctx.target}\``)
+  lines.push('')
+  lines.push(`Run id: \`${ctx.runId}\``)
+  lines.push(`Decision: \`${ctx.decision}\``)
+  lines.push(
+    `Observed in this cycle: ${ctx.observedRunCount} prod runs, ${ctx.observedFeedbackCount} feedback trajectories.`,
+  )
+  lines.push('')
+  if (cluster) {
+    lines.push('### Triggering failure cluster')
+    lines.push('')
+    lines.push(`- **class**: \`${cluster.failureClass}\``)
+    lines.push(`- **runs in cluster**: ${cluster.runCount}`)
+    lines.push(`- **distinct scenarios**: ${cluster.scenarioIds.length}`)
+    if (cluster.toolName) lines.push(`- **tool**: \`${cluster.toolName}\``)
+    if (cluster.dimension) lines.push(`- **judge dimension**: \`${cluster.dimension}\``)
+    if (cluster.exampleError) {
+      lines.push(
+        `- **example error**: \`${cluster.exampleError.slice(0, 200).replace(/\n/g, ' ')}\``,
+      )
+    }
+    lines.push('')
+  }
+  if (gate) {
+    lines.push('### Held-out promotion gate')
+    lines.push('')
+    lines.push(`- **decision**: \`${gate.promote ? 'PROMOTE' : 'REJECT'}\``)
+    lines.push(`- **paired median delta**: ${gate.evidence.medianPairedDelta.toFixed(4)}`)
+    lines.push(
+      `- **paired 95% CI**: [${gate.evidence.pairedCI.low.toFixed(4)}, ${gate.evidence.pairedCI.high.toFixed(4)}]`,
+    )
+    lines.push(`- **paired p-value**: ${gate.evidence.pairedPValue.toFixed(4)}`)
+    lines.push(
+      `- **search/holdout means**: ${gate.evidence.searchScore.toFixed(4)} / ${gate.evidence.holdoutScore.toFixed(4)}`,
+    )
+    lines.push(`- **overfit gap**: ${gate.evidence.overfitGap.toFixed(4)}`)
+    lines.push('')
+  }
+  if (release) {
+    lines.push('### Release confidence')
+    lines.push('')
+    lines.push(`- **status**: \`${release.status}\``)
+    lines.push(`- **pass rate**: ${release.metrics.passRate.toFixed(4)}`)
+    lines.push(`- **mean score**: ${release.metrics.meanScore.toFixed(4)}`)
+    if (release.issues.length > 0) {
+      lines.push('- **issues**:')
+      for (const issue of release.issues) {
+        lines.push(`  - \`${issue.severity}\` ${issue.axis}: ${issue.detail}`)
+      }
+    }
+    lines.push('')
+  }
+  lines.push('### Prompt diff')
+  lines.push('')
+  lines.push('```diff')
+  lines.push(unifiedDiff(ctx.baselinePromptString, ctx.promotedPromptString))
+  lines.push('```')
+  return lines.join('\n')
+}
+
+function unifiedDiff(a: string, b: string): string {
+  const aLines = a.split('\n')
+  const bLines = b.split('\n')
+  const out: string[] = []
+  const max = Math.max(aLines.length, bLines.length)
+  for (let i = 0; i < max; i++) {
+    const al = aLines[i]
+    const bl = bLines[i]
+    if (al === bl) continue
+    if (al !== undefined) out.push(`- ${al}`)
+    if (bl !== undefined) out.push(`+ ${bl}`)
+  }
+  return out.join('\n')
+}
diff --git a/src/wire/handlers.ts b/src/wire/handlers.ts
index 7d7911e..d2cae9f 100644
--- a/src/wire/handlers.ts
+++ b/src/wire/handlers.ts
@@ -9,16 +9,23 @@
  *   - Throws `WireError` for caller-fixable errors (404, 400, 422).
  *   - Lets unexpected errors bubble — the transport maps them to 500.
  */
+import type { FeedbackTrajectoryStore } from '../feedback-trajectory'
 import { callLlmJson } from '../llm-client'
+import type { TraceEvent as InternalTraceEvent } from '../trace/schema'
+import type { TraceStore } from '../trace/store'
 import { getBuiltinRubric, listBuiltinRubrics } from './rubrics'
 import {
+  type FeedbackIngestResponse,
   hashRubric,
   type JudgeRequest,
   type JudgeResult,
   type ListRubricsResponse,
   type Rubric,
+  type TracesIngestRequest,
+  type TracesIngestResponse,
   type VersionResponse,
   WIRE_VERSION,
+  type FeedbackTrajectory as WireFeedbackTrajectory,
 } from './schemas'
 
 /** Caller-fixable error. The transport renders this to 4xx + ErrorResponse. */
@@ -260,6 +267,82 @@ export function handleVersion(): VersionResponse {
     package: '@tangle-network/agent-eval',
     version: readPackageVersion(),
     wireVersion: WIRE_VERSION,
-    apiSurface: ['judge', 'listRubrics', 'version'],
+    apiSurface: ['judge', 'listRubrics', 'version', 'feedback.ingest', 'traces.ingest'],
   }
 }
+
+// ── Ingestion handlers (0.25.0) ─────────────────────────────────────
+
+/**
+ * Pluggable stores the wire layer routes ingestion writes into. Both
+ * are optional — when omitted, the corresponding endpoint returns 503.
+ *
+ * Production deployments wire a `FileSystemTraceStore` and
+ * `FileSystemFeedbackTrajectoryStore` here. Tests substitute in-memory
+ * stores.
+ */
+export interface IngestionStores {
+  traceStore?: TraceStore
+  feedbackStore?: FeedbackTrajectoryStore
+}
+
+/**
+ * `POST /v1/traces/ingest` — accept a batch of `TraceEvent`s from the
+ * production runtime. Best-effort: each event is appended independently;
+ * one bad event does not poison the batch.
+ *
+ * Idempotency: the underlying store is append-only; consumers retrying
+ * the same payload will get duplicate events. Consumers should
+ * de-duplicate by `eventId` downstream — production traces frequently
+ * land via at-least-once buses (Kafka, SQS) where dedup is unavoidable.
+ */
+export async function handleTracesIngest(
+  req: TracesIngestRequest,
+  stores: IngestionStores,
+): Promise<TracesIngestResponse> {
+  if (!stores.traceStore) {
+    throw new WireError(
+      'service_unavailable',
+      'No trace store configured on this server. Pass `traceStore` to `createApp`.',
+      503,
+    )
+  }
+  const errors: Array<{ eventId: string; message: string }> = []
+  let accepted = 0
+  for (const event of req.events) {
+    try {
+      // The wire `TraceEvent` is structurally identical to the internal one.
+      await stores.traceStore.appendEvent(event as InternalTraceEvent)
+      accepted++
+    } catch (err) {
+      errors.push({
+        eventId: event.eventId,
+        message: err instanceof Error ? err.message : String(err),
+      })
+    }
+  }
+  return { accepted, rejected: errors.length, errors }
+}
+
+/**
+ * `POST /v1/feedback` — accept a single `FeedbackTrajectory` from the
+ * production runtime. Idempotent on `id`: re-posting the same trajectory
+ * replaces the prior record.
+ */
+export async function handleFeedbackIngest(
+  req: WireFeedbackTrajectory,
+  stores: IngestionStores,
+): Promise<FeedbackIngestResponse> {
+  if (!stores.feedbackStore) {
+    throw new WireError(
+      'service_unavailable',
+      'No feedback store configured on this server. Pass `feedbackStore` to `createApp`.',
+      503,
+    )
+  }
+  // The wire `FeedbackTrajectory` aligns 1:1 with the internal type;
+  // cast through `unknown` since the wire schema is a Zod-inferred
+  // structural type with optional fields the internal store consumes.
+  await stores.feedbackStore.save(req as unknown as Parameters<FeedbackTrajectoryStore['save']>[0])
+  return { id: req.id, persisted: true }
+}
diff --git a/src/wire/openapi.ts b/src/wire/openapi.ts
index 1aaf6d3..8603d59 100644
--- a/src/wire/openapi.ts
+++ b/src/wire/openapi.ts
@@ -14,10 +14,14 @@ import type { OpenAPIObject } from 'openapi3-ts/oas31'
 
 import {
   ErrorResponseSchema,
+  FeedbackIngestResponseSchema,
+  FeedbackTrajectorySchema,
   HealthResponseSchema,
   JudgeRequestSchema,
   JudgeResultSchema,
   ListRubricsResponseSchema,
+  TracesIngestRequestSchema,
+  TracesIngestResponseSchema,
   VersionResponseSchema,
   WIRE_VERSION,
 } from './schemas'
@@ -32,6 +36,10 @@ export function buildOpenApi(packageVersion: string): OpenAPIObject {
   registry.register('VersionResponse', VersionResponseSchema)
   registry.register('HealthResponse', HealthResponseSchema)
   registry.register('ErrorResponse', ErrorResponseSchema)
+  registry.register('TracesIngestRequest', TracesIngestRequestSchema)
+  registry.register('TracesIngestResponse', TracesIngestResponseSchema)
+  registry.register('FeedbackTrajectory', FeedbackTrajectorySchema)
+  registry.register('FeedbackIngestResponse', FeedbackIngestResponseSchema)
 
   // Routes
   registry.registerPath({
@@ -106,6 +114,73 @@ export function buildOpenApi(packageVersion: string): OpenAPIObject {
     },
   })
 
+  registry.registerPath({
+    method: 'post',
+    path: '/v1/traces/ingest',
+    summary: 'Ingest a batch of production TraceEvents',
+    description:
+      'Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.',
+    request: {
+      body: {
+        content: {
+          'application/json': { schema: TracesIngestRequestSchema },
+          'application/x-ndjson': { schema: TracesIngestRequestSchema },
+        },
+      },
+    },
+    responses: {
+      200: {
+        description: 'Ingestion summary',
+        content: { 'application/json': { schema: TracesIngestResponseSchema } },
+      },
+      400: {
+        description: 'Validation error',
+        content: { 'application/json': { schema: ErrorResponseSchema } },
+      },
+      401: {
+        description: 'Unauthorized (when bearer auth is configured)',
+        content: { 'application/json': { schema: ErrorResponseSchema } },
+      },
+      503: {
+        description: 'No trace store configured',
+        content: { 'application/json': { schema: ErrorResponseSchema } },
+      },
+    },
+  })
+
+  registry.registerPath({
+    method: 'post',
+    path: '/v1/feedback',
+    summary: 'Ingest a FeedbackTrajectory from production',
+    description:
+      'Persist a single FeedbackTrajectory. Idempotent on trajectory.id — re-posting replaces the prior record. Used by production runtimes to forward user 👍/👎/edits into the eval substrate.',
+    request: {
+      body: {
+        content: {
+          'application/json': { schema: FeedbackTrajectorySchema },
+        },
+      },
+    },
+    responses: {
+      200: {
+        description: 'Persisted',
+        content: { 'application/json': { schema: FeedbackIngestResponseSchema } },
+      },
+      400: {
+        description: 'Validation error',
+        content: { 'application/json': { schema: ErrorResponseSchema } },
+      },
+      401: {
+        description: 'Unauthorized (when bearer auth is configured)',
+        content: { 'application/json': { schema: ErrorResponseSchema } },
+      },
+      503: {
+        description: 'No feedback store configured',
+        content: { 'application/json': { schema: ErrorResponseSchema } },
+      },
+    },
+  })
+
   const generator = new OpenApiGeneratorV31(registry.definitions)
   const doc = generator.generateDocument({
     openapi: '3.1.0',
diff --git a/src/wire/schemas.ts b/src/wire/schemas.ts
index fd3e553..1a8c542 100644
--- a/src/wire/schemas.ts
+++ b/src/wire/schemas.ts
@@ -181,6 +181,171 @@ export const HealthResponseSchema = z
   })
   .openapi('HealthResponse')
 
+// ── Ingestion: production traces + user feedback (0.25.0) ───────────
+
+/**
+ * Minimal `TraceEvent` shape that the production runtime emits.
+ * Matches `trace/schema.ts` `TraceEvent` but is duplicated here as a
+ * wire schema so non-TypeScript clients can validate without depending
+ * on internal types.
+ */
+export const TraceEventSchema = z
+  .object({
+    eventId: z.string().min(1).describe('Stable id for the event. Use ULID or UUID.'),
+    runId: z.string().min(1).describe('Run this event belongs to.'),
+    spanId: z.string().optional().describe('Span that emitted the event, if any.'),
+    kind: z
+      .enum([
+        'log',
+        'error',
+        'budget_decrement',
+        'budget_breach',
+        'state_mutation',
+        'policy_violation',
+        'redaction_applied',
+        'custom',
+      ])
+      .describe('Coarse event category — matches the TraceSchema v1 EventKind enum.'),
+    timestamp: z
+      .number()
+      .int()
+      .nonnegative()
+      .describe('Unix millis. Must be monotonically non-decreasing within a span.'),
+    payload: z
+      .record(z.string(), z.unknown())
+      .describe('Free-form payload — the runtime owns the shape.'),
+  })
+  .openapi('TraceEvent')
+
+export const TracesIngestRequestSchema = z
+  .object({
+    events: z
+      .array(TraceEventSchema)
+      .min(1)
+      .max(10_000)
+      .describe('Batch of events. Max 10k per call — bigger streams should be chunked.'),
+  })
+  .openapi('TracesIngestRequest')
+
+export const TracesIngestResponseSchema = z
+  .object({
+    accepted: z.number().int().nonnegative().describe('Number of events persisted.'),
+    rejected: z
+      .number()
+      .int()
+      .nonnegative()
+      .describe('Number of events the store refused — see `errors[]` for reasons.'),
+    errors: z
+      .array(
+        z.object({
+          eventId: z.string().describe('Event id this error applies to.'),
+          message: z.string().describe('Why the event was rejected.'),
+        }),
+      )
+      .default([]),
+  })
+  .openapi('TracesIngestResponse')
+
+export const FeedbackLabelSchema = z
+  .object({
+    id: z.string().optional(),
+    source: z.enum(['user', 'judge', 'environment', 'metric', 'policy', 'system']),
+    kind: z.enum([
+      'approve',
+      'reject',
+      'select',
+      'edit',
+      'rank',
+      'rate',
+      'comment',
+      'metric_outcome',
+      'policy_block',
+      'revision_request',
+    ]),
+    value: z.unknown(),
+    reason: z.string().optional(),
+    severity: z.enum(['info', 'warning', 'error', 'critical']).optional(),
+    createdAt: z.string().describe('ISO-8601 UTC.'),
+    metadata: z.record(z.string(), z.unknown()).optional(),
+  })
+  .openapi('FeedbackLabel')
+
+export const FeedbackAttemptSchema = z
+  .object({
+    id: z.string().min(1),
+    stepIndex: z.number().int().nonnegative(),
+    artifactType: z.enum([
+      'text',
+      'code',
+      'plan',
+      'research',
+      'action',
+      'ui',
+      'decision',
+      'data',
+      'other',
+    ]),
+    artifact: z.unknown(),
+    options: z.array(z.unknown()).optional(),
+    proposedAction: z
+      .object({
+        type: z.string(),
+        risk: z.enum(['low', 'medium', 'high']).optional(),
+        costUsd: z.number().optional(),
+        externalSideEffect: z.boolean().optional(),
+        requiresApproval: z.boolean().optional(),
+        metadata: z.record(z.string(), z.unknown()).optional(),
+      })
+      .optional(),
+    feedback: z.array(FeedbackLabelSchema).optional(),
+    createdAt: z.string(),
+    metadata: z.record(z.string(), z.unknown()).optional(),
+  })
+  .openapi('FeedbackAttempt')
+
+export const FeedbackTrajectorySchema = z
+  .object({
+    id: z.string().min(1).describe('Stable id; idempotency key for the trajectory.'),
+    projectId: z.string().optional(),
+    scenarioId: z.string().optional(),
+    task: z.object({
+      intent: z.string().min(1),
+      context: z.unknown().optional(),
+    }),
+    attempts: z.array(FeedbackAttemptSchema).default([]),
+    labels: z.array(FeedbackLabelSchema).default([]),
+    outcome: z
+      .object({
+        success: z.boolean().optional(),
+        score: z.number().optional(),
+        metrics: z.record(z.string(), z.number()).optional(),
+        costUsd: z.number().optional(),
+        detail: z.string().optional(),
+        observedAt: z.string().optional(),
+        metadata: z.record(z.string(), z.unknown()).optional(),
+      })
+      .optional(),
+    split: z.enum(['train', 'dev', 'test', 'holdout']).optional(),
+    tags: z.record(z.string(), z.string()).optional(),
+    createdAt: z.string().describe('ISO-8601 UTC.'),
+    updatedAt: z.string().optional(),
+    metadata: z.record(z.string(), z.unknown()).optional(),
+  })
+  .openapi('FeedbackTrajectory')
+
+export const FeedbackIngestResponseSchema = z
+  .object({
+    id: z.string().describe('Trajectory id that was persisted.'),
+    persisted: z.boolean().describe('True when the trajectory was saved (idempotent on id).'),
+  })
+  .openapi('FeedbackIngestResponse')
+
+export type TraceEvent = z.infer<typeof TraceEventSchema>
+export type TracesIngestRequest = z.infer<typeof TracesIngestRequestSchema>
+export type TracesIngestResponse = z.infer<typeof TracesIngestResponseSchema>
+export type FeedbackTrajectory = z.infer<typeof FeedbackTrajectorySchema>
+export type FeedbackIngestResponse = z.infer<typeof FeedbackIngestResponseSchema>
+
 // ── Errors ──────────────────────────────────────────────────────────
 
 export const ErrorResponseSchema = z
diff --git a/src/wire/server.ts b/src/wire/server.ts
index e531348..7efdcc5 100644
--- a/src/wire/server.ts
+++ b/src/wire/server.ts
@@ -6,29 +6,81 @@
  *   2. Calls the matching handler in `handlers.ts`.
  *   3. Renders 4xx for `WireError` with structured body, 500 for unexpected.
  *
- * The server has no internal state besides the handler imports — restart
- * costs nothing. Run via `agent-eval serve --port 5005`.
+ * The server holds optional `IngestionStores` (passed to `createApp`)
+ * to receive production traces and user feedback. With no stores wired,
+ * the ingestion endpoints return 503 — read endpoints (`/v1/judge`,
+ * `/v1/rubrics`, `/v1/version`) remain fully functional.
+ *
+ * Run via `agent-eval serve --port 5005`.
  */
 import { type ServerType, serve } from '@hono/node-server'
 import { Hono } from 'hono'
 import { cors } from 'hono/cors'
 
-import { handleJudge, handleListRubrics, handleVersion, WireError } from './handlers'
+import {
+  handleFeedbackIngest,
+  handleJudge,
+  handleListRubrics,
+  handleTracesIngest,
+  handleVersion,
+  type IngestionStores,
+  WireError,
+} from './handlers'
 import { buildOpenApi } from './openapi'
-import { JudgeRequestSchema } from './schemas'
+import { FeedbackTrajectorySchema, JudgeRequestSchema, TracesIngestRequestSchema } from './schemas'
 
 const STARTED_AT = Date.now()
 
-export function createApp() {
+export interface CreateAppOptions {
+  /** Stores wired to the ingestion endpoints. */
+  stores?: IngestionStores
+  /**
+   * Bearer-token auth. When provided, every endpoint EXCEPT `/healthz`
+   * and `/v1/version` requires `Authorization: Bearer <token>`. The
+   * token may be a static string OR a function for time-bounded /
+   * rotating tokens.
+   *
+   * Recommended for any server that accepts ingestion writes from the
+   * public internet. Read-only deployments may omit it.
+   */
+  auth?: {
+    bearer: string | ((token: string) => boolean | Promise<boolean>)
+  }
+}
+
+const AUTH_EXEMPT_PATHS = new Set(['/healthz', '/v1/version', '/openapi.json'])
+
+export function createApp(opts: CreateAppOptions = {}) {
   const app = new Hono()
 
   app.use('*', cors())
 
+  // Bearer-token middleware (only attached when configured).
+  if (opts.auth) {
+    const verify = opts.auth.bearer
+    app.use('*', async (c, next) => {
+      const path = new URL(c.req.url).pathname
+      if (AUTH_EXEMPT_PATHS.has(path)) return next()
+      const raw = c.req.header('authorization') ?? ''
+      const match = raw.match(/^Bearer\s+(.+)$/i)
+      if (!match) {
+        throw new WireError('unauthorized', 'Missing or malformed Authorization header.', 401)
+      }
+      const token = match[1] as string
+      const ok = typeof verify === 'string' ? token === verify : await verify(token)
+      if (!ok) {
+        throw new WireError('unauthorized', 'Invalid bearer token.', 401)
+      }
+      return next()
+    })
+  }
+
   app.onError((err, c) => {
     if (err instanceof WireError) {
+      const status = err.status as 400 | 401 | 404 | 422 | 500 | 503
       return c.json(
         { error: { code: err.code, message: err.message, details: err.details } },
-        err.status as 400 | 404 | 422 | 500,
+        status,
       )
     }
     // Unexpected — log and return generic 500 without leaking internals.
@@ -66,13 +118,74 @@ export function createApp() {
     return c.json(result)
   })
 
+  // ── Traces ingest (NDJSON-friendly: accepts either {events:[...]} or NDJSON) ──
+  app.post('/v1/traces/ingest', async (c) => {
+    const contentType = c.req.header('content-type') ?? ''
+    let payload: unknown
+    if (contentType.includes('application/x-ndjson')) {
+      const text = await c.req.text()
+      const events = text
+        .split('\n')
+        .map((line) => line.trim())
+        .filter((line) => line.length > 0)
+        .map((line) => {
+          try {
+            return JSON.parse(line)
+          } catch {
+            throw new WireError(
+              'validation_error',
+              'NDJSON line did not parse as JSON.',
+              400,
+              line.slice(0, 200),
+            )
+          }
+        })
+      payload = { events }
+    } else {
+      payload = await c.req.json().catch(() => null)
+    }
+    if (payload == null) {
+      throw new WireError('validation_error', 'Request body must be JSON or NDJSON.', 400)
+    }
+    const parsed = TracesIngestRequestSchema.safeParse(payload)
+    if (!parsed.success) {
+      throw new WireError(
+        'validation_error',
+        'Request did not match TracesIngestRequest schema.',
+        400,
+        parsed.error.issues,
+      )
+    }
+    const result = await handleTracesIngest(parsed.data, opts.stores ?? {})
+    return c.json(result)
+  })
+
+  // ── Feedback ingest ──
+  app.post('/v1/feedback', async (c) => {
+    const raw = await c.req.json().catch(() => null)
+    if (raw == null) {
+      throw new WireError('validation_error', 'Request body must be JSON.', 400)
+    }
+    const parsed = FeedbackTrajectorySchema.safeParse(raw)
+    if (!parsed.success) {
+      throw new WireError(
+        'validation_error',
+        'Request did not match FeedbackTrajectory schema.',
+        400,
+        parsed.error.issues,
+      )
+    }
+    const result = await handleFeedbackIngest(parsed.data, opts.stores ?? {})
+    return c.json(result)
+  })
+
   // ── OpenAPI spec ──
   app.get('/openapi.json', (c) => c.json(buildOpenApi(handleVersion().version)))
 
   return app
 }
 
-export interface ServeOptions {
+export interface ServeOptions extends CreateAppOptions {
   /** Default 5005. */
   port?: number
   /** Default '127.0.0.1'. Set to '0.0.0.0' to listen on all interfaces. */
@@ -80,7 +193,7 @@ export interface ServeOptions {
 }
 
 export function startServer(opts: ServeOptions = {}): ServerType {
-  const app = createApp()
+  const app = createApp(opts)
   const port = opts.port ?? 5005
   const host = opts.host ?? '127.0.0.1'
   return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
diff --git a/tests/auto-pr.test.ts b/tests/auto-pr.test.ts
new file mode 100644
index 0000000..85b59ff
--- /dev/null
+++ b/tests/auto-pr.test.ts
@@ -0,0 +1,290 @@
+/**
+ * Auto-PR tests.
+ *
+ * Regression coverage:
+ *   - input validation (bad branch names, '..' paths, duplicate paths)
+ *   - HTTP client opens a PR via the documented REST sequence (blob →
+ *     tree → commit → ref → pulls)
+ *   - HTTP client is idempotent: existing open PR is returned instead
+ *     of a duplicate create
+ *   - HTTP client fast-forwards the ref when it already exists with a
+ *     different SHA
+ *   - dryRun does not call fetch/exec
+ *
+ * No network.
+ */
+import { describe, expect, it, vi } from 'vitest'
+
+import {
+  ghCliClient,
+  httpGithubClient,
+  proposeAutomatedPullRequest,
+} from '../src/auto-pr'
+import { ValidationError } from '../src/errors'
+
+function jsonResponse(body: unknown, status = 200): Response {
+  return new Response(JSON.stringify(body), {
+    status,
+    headers: { 'content-type': 'application/json' },
+  })
+}
+
+describe('proposeAutomatedPullRequest validation', () => {
+  const fixture = {
+    repo: { owner: 'tangle-network', name: 'tax-agent' },
+    branchName: 'eval/auto-improve/r1',
+    fileChanges: [{ path: 'prompts/system.txt', contents: 'new' }],
+    title: 'feat: production-loop',
+    body: 'body',
+  }
+
+  it('rejects an empty repo owner', async () => {
+    const client = { proposeChange: vi.fn() }
+    await expect(
+      proposeAutomatedPullRequest(client, { ...fixture, repo: { owner: '', name: 'x' } }),
+    ).rejects.toBeInstanceOf(ValidationError)
+    expect(client.proposeChange).not.toHaveBeenCalled()
+  })
+
+  it('rejects whitespace branch names', async () => {
+    const client = { proposeChange: vi.fn() }
+    await expect(
+      proposeAutomatedPullRequest(client, { ...fixture, branchName: 'has space' }),
+    ).rejects.toBeInstanceOf(ValidationError)
+  })
+
+  it('rejects path traversal', async () => {
+    const client = { proposeChange: vi.fn() }
+    await expect(
+      proposeAutomatedPullRequest(client, {
+        ...fixture,
+        fileChanges: [{ path: '../etc/passwd', contents: '' }],
+      }),
+    ).rejects.toBeInstanceOf(ValidationError)
+  })
+
+  it('rejects duplicate paths', async () => {
+    const client = { proposeChange: vi.fn() }
+    await expect(
+      proposeAutomatedPullRequest(client, {
+        ...fixture,
+        fileChanges: [
+          { path: 'a.txt', contents: '1' },
+          { path: 'a.txt', contents: '2' },
+        ],
+      }),
+    ).rejects.toBeInstanceOf(ValidationError)
+  })
+
+  it('rejects branch name equal to base branch', async () => {
+    const client = { proposeChange: vi.fn() }
+    await expect(
+      proposeAutomatedPullRequest(client, { ...fixture, branchName: 'main', baseBranch: 'main' }),
+    ).rejects.toBeInstanceOf(ValidationError)
+  })
+
+  it('empty title is rejected', async () => {
+    const client = { proposeChange: vi.fn() }
+    await expect(
+      proposeAutomatedPullRequest(client, { ...fixture, title: '   ' }),
+    ).rejects.toBeInstanceOf(ValidationError)
+  })
+
+  it('passes through to client.proposeChange on valid input', async () => {
+    const client = {
+      proposeChange: vi
+        .fn()
+        .mockResolvedValue({ prUrl: 'u', branchName: fixture.branchName, headSha: 's', dryRun: false }),
+    }
+    const result = await proposeAutomatedPullRequest(client, fixture)
+    expect(result.prUrl).toBe('u')
+    expect(client.proposeChange).toHaveBeenCalledTimes(1)
+  })
+})
+
+describe('httpGithubClient', () => {
+  function fakeFetch(handler: (url: string, init: RequestInit) => Response): typeof fetch {
+    return ((url: RequestInfo | URL, init?: RequestInit) =>
+      Promise.resolve(handler(String(url), init ?? {}))) as typeof fetch
+  }
+
+  function transcript() {
+    const requests: Array<{ method: string; url: string; body: unknown }> = []
+    return {
+      requests,
+      record(url: string, init: RequestInit) {
+        requests.push({
+          method: init.method ?? 'GET',
+          url,
+          body: init.body ? JSON.parse(init.body as string) : null,
+        })
+      },
+    }
+  }
+
+  it('opens a PR via blob → tree → commit → ref → pulls', async () => {
+    const t = transcript()
+    const client = httpGithubClient({
+      token: 'test-token',
+      fetchImpl: fakeFetch((url, init) => {
+        t.record(url, init)
+        // Base ref
+        if (url.endsWith('/git/ref/heads/main')) {
+          return jsonResponse({ ref: 'refs/heads/main', object: { sha: 'base-sha' } })
+        }
+        // Base commit (for tree.sha)
+        if (url.endsWith('/git/commits/base-sha')) {
+          return jsonResponse({ sha: 'base-sha', tree: { sha: 'base-tree-sha' } })
+        }
+        // Create blob
+        if (url.endsWith('/git/blobs') && init.method === 'POST') {
+          return jsonResponse({ sha: 'blob-sha-1' })
+        }
+        // Create tree
+        if (url.endsWith('/git/trees') && init.method === 'POST') {
+          return jsonResponse({ sha: 'tree-sha-1' })
+        }
+        // Create commit
+        if (url.endsWith('/git/commits') && init.method === 'POST') {
+          return jsonResponse({ sha: 'commit-sha-1', tree: { sha: 'tree-sha-1' } })
+        }
+        // Branch ref existence (404 -> not present)
+        if (url.includes('/git/ref/heads/eval')) {
+          return jsonResponse({ message: 'Not Found' }, 404)
+        }
+        // Create ref
+        if (url.endsWith('/git/refs') && init.method === 'POST') {
+          return jsonResponse({ ref: 'refs/heads/eval/r1', object: { sha: 'commit-sha-1' } })
+        }
+        // List PRs (empty -> nothing exists)
+        if (url.includes('/pulls?')) {
+          return jsonResponse([])
+        }
+        // Create PR
+        if (url.endsWith('/pulls') && init.method === 'POST') {
+          return jsonResponse({
+            html_url: 'https://github.com/o/r/pull/1',
+            number: 1,
+          })
+        }
+        // Reviewers / labels (best-effort, accept any)
+        return jsonResponse({}, 200)
+      }),
+      now: () => new Date('2026-01-01T00:00:00Z'),
+    })
+
+    const result = await client.proposeChange({
+      repo: { owner: 'o', name: 'r' },
+      branchName: 'eval/r1',
+      fileChanges: [{ path: 'a.txt', contents: 'hello' }],
+      title: 'T',
+      body: 'B',
+    })
+
+    expect(result.prUrl).toBe('https://github.com/o/r/pull/1')
+    expect(result.headSha).toBe('commit-sha-1')
+    expect(result.dryRun).toBe(false)
+
+    // Verify documented REST sequence in order.
+    const ordered = t.requests.map((r) => `${r.method} ${r.url.replace(/\?.*$/, '')}`)
+    expect(ordered).toContain('GET https://api.github.com/repos/o/r/git/ref/heads/main')
+    expect(ordered).toContain('POST https://api.github.com/repos/o/r/git/blobs')
+    expect(ordered).toContain('POST https://api.github.com/repos/o/r/git/trees')
+    expect(ordered).toContain('POST https://api.github.com/repos/o/r/git/commits')
+    expect(ordered).toContain('POST https://api.github.com/repos/o/r/git/refs')
+    expect(ordered).toContain('POST https://api.github.com/repos/o/r/pulls')
+  })
+
+  it('returns the existing open PR instead of opening a duplicate (idempotency)', async () => {
+    const client = httpGithubClient({
+      token: 'tok',
+      fetchImpl: fakeFetch((url, init) => {
+        if (url.endsWith('/git/ref/heads/main')) {
+          return jsonResponse({ ref: 'refs/heads/main', object: { sha: 'base-sha' } })
+        }
+        if (url.endsWith('/git/commits/base-sha')) {
+          return jsonResponse({ sha: 'base-sha', tree: { sha: 'base-tree' } })
+        }
+        if (url.endsWith('/git/blobs')) return jsonResponse({ sha: 'b1' })
+        if (url.endsWith('/git/trees')) return jsonResponse({ sha: 't1' })
+        if (url.endsWith('/git/commits') && init.method === 'POST') {
+          return jsonResponse({ sha: 'c1', tree: { sha: 't1' } })
+        }
+        // Branch already exists at same sha.
+        if (url.endsWith('/git/ref/heads/eval/r1')) {
+          return jsonResponse({ ref: 'refs/heads/eval/r1', object: { sha: 'c1' } })
+        }
+        if (url.includes('/pulls?')) {
+          return jsonResponse([{ html_url: 'https://github.com/o/r/pull/77', number: 77 }])
+        }
+        return jsonResponse({}, 200)
+      }),
+    })
+
+    const result = await client.proposeChange({
+      repo: { owner: 'o', name: 'r' },
+      branchName: 'eval/r1',
+      fileChanges: [{ path: 'a.txt', contents: 'x' }],
+      title: 'T',
+      body: 'B',
+    })
+
+    expect(result.prUrl).toBe('https://github.com/o/r/pull/77')
+  })
+
+  it('dryRun does not call fetch', async () => {
+    const fetchSpy = vi.fn()
+    const client = httpGithubClient({
+      token: 'tok',
+      fetchImpl: fetchSpy as unknown as typeof fetch,
+    })
+    const r = await client.proposeChange({
+      repo: { owner: 'o', name: 'r' },
+      branchName: 'eval/r1',
+      fileChanges: [{ path: 'a.txt', contents: 'x' }],
+      title: 'T',
+      body: 'B',
+      dryRun: true,
+    })
+    expect(r.dryRun).toBe(true)
+    expect(fetchSpy).not.toHaveBeenCalled()
+  })
+
+  it('surfaces GitHub API failures with status code and body excerpt', async () => {
+    const client = httpGithubClient({
+      token: 'tok',
+      fetchImpl: () =>
+        Promise.resolve(
+          new Response('rate limit exceeded', { status: 403, headers: { 'content-type': 'text/plain' } }),
+        ),
+    })
+
+    await expect(
+      client.proposeChange({
+        repo: { owner: 'o', name: 'r' },
+        branchName: 'eval/r1',
+        fileChanges: [{ path: 'a.txt', contents: 'x' }],
+        title: 'T',
+        body: 'B',
+      }),
+    ).rejects.toThrow(/403/)
+  })
+})
+
+describe('ghCliClient', () => {
+  it('dryRun returns a synthetic compare URL without exec calls', async () => {
+    const execSpy = vi.fn()
+    const client = ghCliClient({ exec: execSpy as unknown as never })
+    const r = await client.proposeChange({
+      repo: { owner: 'o', name: 'r' },
+      branchName: 'eval/r1',
+      fileChanges: [{ path: 'a.txt', contents: 'x' }],
+      title: 'T',
+      body: 'B',
+      dryRun: true,
+    })
+    expect(r.dryRun).toBe(true)
+    expect(r.prUrl).toContain('compare/main...eval/r1')
+    expect(execSpy).not.toHaveBeenCalled()
+  })
+})
diff --git a/tests/production-loop-example.test.ts b/tests/production-loop-example.test.ts
new file mode 100644
index 0000000..51e4179
--- /dev/null
+++ b/tests/production-loop-example.test.ts
@@ -0,0 +1,186 @@
+/**
+ * Worked-example regression: runs the same shape as
+ * `examples/production-loop/index.ts` against the in-memory stores and
+ * verifies a PR-shaped output. If the example breaks, this test fails
+ * — which means CI catches example rot without spawning a tsx process.
+ */
+import { describe, expect, it } from 'vitest'
+
+import type {
+  AutoPrClient,
+  ProposeAutomatedPullRequestInput,
+  ProposeAutomatedPullRequestResult,
+} from '../src/auto-pr'
+import { InMemoryFeedbackTrajectoryStore } from '../src/feedback-trajectory'
+import { runProductionLoop } from '../src/production-loop'
+import { InMemoryTraceStore } from '../src/trace/store'
+import type { Scenario } from '../src/types'
+
+interface TaxAgentPayload {
+  systemPrompt: string
+}
+
+function scenario(id: string, persona: string): Scenario {
+  return {
+    id,
+    persona,
+    label: id,
+    thesis: `Filing scenario: ${id}`,
+    dimensions: ['correctness'],
+    turns: [{ user: 'Help me file my taxes', expectedBehaviors: ['cite a statute'] }],
+    artifactChecks: [],
+  }
+}
+
+describe('worked example: production-loop demo end-to-end', () => {
+  it('fires the loop on a synthetic prod failure cluster and produces a PR-shaped output', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    // Seed 8 prod failures in the same cluster.
+    for (let i = 0; i < 8; i++) {
+      await traceStore.appendRun({
+        runId: `prod-run-${i}`,
+        scenarioId: 'ftc-noncompete-edge',
+        startedAt: Date.now() - 3_600_000 + i * 10_000,
+        endedAt: Date.now() - 3_600_000 + i * 10_000 + 500,
+        status: 'failed',
+        outcome: { pass: false, score: 0.2, failureClass: 'instruction_following' },
+      })
+      await feedbackStore.save({
+        id: `ft-${i}`,
+        scenarioId: 'ftc-noncompete-edge',
+        task: { intent: 'FTC question' },
+        attempts: [],
+        labels: [
+          {
+            source: 'user',
+            kind: 'reject',
+            value: { thumb: 'down' },
+            severity: 'error',
+            createdAt: new Date().toISOString(),
+          },
+        ],
+        createdAt: new Date().toISOString(),
+      })
+    }
+
+    const captured: ProposeAutomatedPullRequestInput[] = []
+    const prClient: AutoPrClient = {
+      proposeChange(input): Promise<ProposeAutomatedPullRequestResult> {
+        captured.push(input)
+        return Promise.resolve({
+          prUrl: `https://github.com/o/r/pull/synthetic-1`,
+          branchName: input.branchName,
+          headSha: 'cafe1234'.padEnd(40, '0'),
+          dryRun: false,
+        })
+      },
+    }
+
+    const baselinePrompt =
+      'You are a tax assistant. Be helpful and concise. ' +
+      'Answer questions about US tax forms and rules.'
+
+    const result = await runProductionLoop<TaxAgentPayload>({
+      runId: 'worked-example-prod-loop',
+      target: 'tax-agent',
+      traceStore,
+      feedbackStore,
+      cluster: { minClusterSize: 5, minSeverityRatio: 0.05, maxClustersPerCycle: 1 },
+      evolve: {
+        baselinePrompt: { systemPrompt: baselinePrompt },
+        holdoutScenarios: [
+          scenario('ftc-noncompete-edge', 'small-biz-owner'),
+          scenario('schedule-c-self-employed', 'freelancer'),
+          scenario('w2-multistate', 'remote-worker'),
+        ],
+        searchScenarios: [scenario('basic-w2', 'salaried'), scenario('joint-return', 'married')],
+        runner: {
+          run: ({ variant, scenarioId }) => ({
+            trace: {
+              scenarioId,
+              turns: [
+                { role: 'user', content: 'help' },
+                { role: 'assistant', content: variant.payload.systemPrompt },
+              ],
+              transcript: variant.payload.systemPrompt,
+            },
+            costUsd: 0.01,
+            durationMs: 5,
+          }),
+        },
+        scorer: {
+          score: ({ variant }) => {
+            const lengthSignal = Math.min(1, variant.payload.systemPrompt.length / 300)
+            const citationBonus = /cite/i.test(variant.payload.systemPrompt) ? 0.3 : 0
+            const refuseBonus = /refuse/i.test(variant.payload.systemPrompt) ? 0.15 : 0
+            const score = Math.min(1, 0.3 + lengthSignal * 0.4 + citationBonus + refuseBonus)
+            return { score, ok: score >= 0.6 }
+          },
+        },
+        mutator: {
+          mutate: async ({ parent, childCount, generation }) =>
+            Array.from({ length: childCount }, (_, i) => ({
+              id: `${parent.id}-cite-${generation}-${i}`,
+              label: 'cite-required',
+              generation,
+              parentId: parent.id,
+              payload: {
+                systemPrompt:
+                  `${parent.payload.systemPrompt} ` +
+                  'When the question concerns a contested rule, you MUST cite the statute by section number, ' +
+                  'and you MUST refuse to answer if the rule is not in your active corpus.',
+              },
+            })),
+        },
+        gate: {
+          baselineKey: 'baseline',
+          minProductiveRuns: 3,
+          pairedDeltaThreshold: 0,
+          overfitGapThreshold: 0.5,
+          seed: 1234,
+        },
+        reps: 2,
+        generations: 2,
+        populationSize: 2,
+      },
+      releaseThresholds: {
+        requireCorpus: false,
+        minPassRate: 0.5,
+        minMeanScore: 0.5,
+        minSearchRuns: 1,
+        minHoldoutRuns: 1,
+        requireAsiForFailures: false,
+      },
+      ship: {
+        client: prClient,
+        repo: { owner: 'tangle-network', name: 'tax-agent' },
+        branchPrefix: 'eval/auto-improve',
+        promptFilePath: 'prompts/tax-agent-system.txt',
+        reviewers: ['drew'],
+        labels: ['production-loop'],
+      },
+      cron: { cadence: 'weekly' },
+    })
+
+    expect(result.decision).toBe('pr_opened')
+    expect(result.observedRunCount).toBe(8)
+    expect(result.observedFeedbackCount).toBe(8)
+    expect(result.actedOnCluster).not.toBeNull()
+    expect(result.actedOnCluster?.runCount).toBe(8)
+    expect(result.gate?.promote).toBe(true)
+    expect(result.release?.status).toBe('pass')
+    expect(result.pullRequest).not.toBeNull()
+    expect(captured).toHaveLength(1)
+
+    const pr = captured[0]
+    expect(pr).toBeDefined()
+    expect(pr?.fileChanges).toHaveLength(1)
+    expect(pr?.fileChanges[0]?.path).toBe('prompts/tax-agent-system.txt')
+    expect(pr?.fileChanges[0]?.contents).toContain('cite the statute')
+    expect(pr?.body).toContain('Triggering failure cluster')
+    expect(pr?.body).toContain('Held-out promotion gate')
+    expect(pr?.body).toContain('Release confidence')
+    expect(pr?.title).toContain('tax-agent: production-loop prompt update')
+  })
+})
diff --git a/tests/production-loop.test.ts b/tests/production-loop.test.ts
new file mode 100644
index 0000000..64d3291
--- /dev/null
+++ b/tests/production-loop.test.ts
@@ -0,0 +1,460 @@
+/**
+ * Production-loop tests.
+ *
+ * Regression coverage:
+ *   - cluster → evolve → propose-PR is deterministic with a fake-fetch
+ *     + fake-gh-api client
+ *   - the loop short-circuits to `no_actionable_failures` when no
+ *     cluster crosses the severity threshold
+ *   - the loop short-circuits to `gate_failed` when the held-out gate
+ *     rejects (fail-closed: a passing evolve does not auto-ship)
+ *   - the loop returns `proposed_change` when no `ship` is wired
+ *     (consumers may persist artifacts without opening a PR)
+ *   - PR body renders cluster details + gate evidence (regression: a
+ *     PR with no context is unreviewable)
+ *   - `AutoPrClient.proposeChange` is called exactly once on success
+ *
+ * No network. No `gh` shell-out. Fake clients verify the contract.
+ */
+import { describe, expect, it } from 'vitest'
+
+import type {
+  AutoPrClient,
+  ProposeAutomatedPullRequestInput,
+  ProposeAutomatedPullRequestResult,
+} from '../src/auto-pr'
+import { InMemoryFeedbackTrajectoryStore } from '../src/feedback-trajectory'
+import type {
+  MultiShotMutateAdapter,
+  MultiShotRunner,
+  MultiShotScorer,
+} from '../src/multi-shot-optimization'
+import { runProductionLoop } from '../src/production-loop'
+import { InMemoryTraceStore } from '../src/trace/store'
+import type { Scenario } from '../src/types'
+
+interface Payload {
+  systemPrompt: string
+}
+
+function scenario(id: string): Scenario {
+  return {
+    id,
+    persona: 'tax-filer',
+    label: id,
+    thesis: `Scenario ${id}`,
+    dimensions: ['correctness'],
+    turns: [{ user: 'Help me file my taxes', expectedBehaviors: ['gather state'] }],
+    artifactChecks: [],
+  }
+}
+
+/**
+ * Deterministic runner: scoring is a pure function of (systemPrompt
+ * length × scenarioId). A longer prompt wins, modeling the intuition
+ * that the evolve step is exploring richer prompts.
+ */
+function makeRunner(): MultiShotRunner<Payload> {
+  return {
+    run: ({ variant, scenarioId }) => ({
+      trace: {
+        scenarioId,
+        turns: [
+          { role: 'user', content: 'help' },
+          { role: 'assistant', content: variant.payload.systemPrompt },
+        ],
+        transcript: variant.payload.systemPrompt,
+      },
+      costUsd: 0.01,
+      durationMs: 5,
+    }),
+  }
+}
+
+function makeScorer(): MultiShotScorer<Payload> {
+  return {
+    score: ({ variant }) => {
+      const score = Math.min(1, variant.payload.systemPrompt.length / 200)
+      return { score, ok: score >= 0.6 }
+    },
+  }
+}
+
+function makeImprovementMutator(): MultiShotMutateAdapter<Payload> {
+  return {
+    mutate: async ({ parent, childCount, generation }) => {
+      const longer = `${parent.payload.systemPrompt} `
+        + 'Always cite the source statute by section number. '
+        + 'Refuse to answer if the FTC rule cited is not in the active corpus. '
+      return Array.from({ length: childCount }, (_, i) => ({
+        id: `${parent.id}-improved-${generation}-${i}`,
+        label: 'improved',
+        generation,
+        parentId: parent.id,
+        payload: { systemPrompt: longer },
+      }))
+    },
+  }
+}
+
+function makeIdentityMutator(): MultiShotMutateAdapter<Payload> {
+  return {
+    mutate: async () => [],
+  }
+}
+
+function makeFailedRun(traceStore: InMemoryTraceStore, runId: string, scenarioId: string): void {
+  traceStore.appendRun({
+    runId,
+    scenarioId,
+    startedAt: Date.now(),
+    endedAt: Date.now() + 1,
+    status: 'failed',
+    outcome: { pass: false, score: 0.1, failureClass: 'reasoning_error' },
+  })
+}
+
+function fakeAutoPrClient(): { client: AutoPrClient; calls: ProposeAutomatedPullRequestInput[] } {
+  const calls: ProposeAutomatedPullRequestInput[] = []
+  const client: AutoPrClient = {
+    proposeChange(input): Promise<ProposeAutomatedPullRequestResult> {
+      calls.push(input)
+      return Promise.resolve({
+        prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/pull/42`,
+        branchName: input.branchName,
+        headSha: 'cafe1234'.padEnd(40, '0'),
+        dryRun: false,
+      })
+    },
+  }
+  return { client, calls }
+}
+
+describe('runProductionLoop', () => {
+  it('returns no_actionable_failures when the failure cluster is below the severity threshold', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    // 1 failure out of 1 run, but minClusterSize=5 forces no action.
+    makeFailedRun(traceStore, 'r-1', 'noisy')
+
+    const result = await runProductionLoop<Payload>({
+      runId: 'prod-loop-quiet',
+      target: 'tax-agent',
+      traceStore,
+      feedbackStore,
+      cluster: { minClusterSize: 5, minSeverityRatio: 0.5 },
+      evolve: {
+        runner: makeRunner(),
+        scorer: makeScorer(),
+        mutator: makeImprovementMutator(),
+        baselinePrompt: { systemPrompt: 'You are a tax assistant.' },
+        holdoutScenarios: [scenario('hold-a'), scenario('hold-b')],
+        gate: { baselineKey: 'baseline', minProductiveRuns: 1, pairedDeltaThreshold: 0 },
+      },
+    })
+
+    expect(result.decision).toBe('no_actionable_failures')
+    expect(result.evolution).toBeNull()
+    expect(result.pullRequest).toBeNull()
+    expect(result.promotedPrompt).toEqual({ systemPrompt: 'You are a tax assistant.' })
+    expect(result.observedRunCount).toBe(1)
+  })
+
+  it('runs evolve and returns proposed_change when ship is not wired', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    // 5 failed runs in the same cluster — crosses minClusterSize=5.
+    for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge')
+
+    const result = await runProductionLoop<Payload>({
+      runId: 'prod-loop-evolve',
+      target: 'tax-agent',
+      traceStore,
+      feedbackStore,
+      cluster: { minClusterSize: 5, minSeverityRatio: 0 },
+      evolve: {
+        runner: makeRunner(),
+        scorer: makeScorer(),
+        mutator: makeImprovementMutator(),
+        baselinePrompt: { systemPrompt: 'Short.' },
+        holdoutScenarios: [scenario('hold-a'), scenario('hold-b'), scenario('hold-c')],
+        searchScenarios: [scenario('search-a'), scenario('search-b')],
+        gate: {
+          baselineKey: 'baseline',
+          minProductiveRuns: 1,
+          pairedDeltaThreshold: -1,
+          overfitGapThreshold: 1,
+          seed: 7,
+        },
+        reps: 1,
+        generations: 2,
+        populationSize: 2,
+      },
+      releaseThresholds: {
+        minPassRate: 0.0,
+        minMeanScore: 0.0,
+        minSearchRuns: 1,
+        minHoldoutRuns: 1,
+        maxOverfitGap: 1,
+        requireAsiForFailures: false,
+        requireCorpus: false,
+      },
+    })
+
+    expect(result.decision).toBe('proposed_change')
+    expect(result.evolution).not.toBeNull()
+    expect(result.actedOnCluster).not.toBeNull()
+    expect(result.pullRequest).toBeNull()
+    // The mutator extends the prompt — promoted must differ from baseline.
+    expect((result.promotedPrompt as Payload).systemPrompt).not.toBe('Short.')
+    expect((result.promotedPrompt as Payload).systemPrompt.length).toBeGreaterThan(
+      (result.baselinePrompt as Payload).systemPrompt.length,
+    )
+  })
+
+  it('opens a PR when ship is wired, gate passes, and release is green', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge')
+    const { client, calls } = fakeAutoPrClient()
+
+    const result = await runProductionLoop<Payload>({
+      runId: 'prod-loop-ship',
+      target: 'tax-agent',
+      traceStore,
+      feedbackStore,
+      cluster: { minClusterSize: 5, minSeverityRatio: 0 },
+      evolve: {
+        runner: makeRunner(),
+        scorer: makeScorer(),
+        mutator: makeImprovementMutator(),
+        baselinePrompt: { systemPrompt: 'Short.' },
+        holdoutScenarios: [scenario('hold-a'), scenario('hold-b'), scenario('hold-c')],
+        searchScenarios: [scenario('search-a'), scenario('search-b')],
+        gate: {
+          baselineKey: 'baseline',
+          minProductiveRuns: 1,
+          pairedDeltaThreshold: -1,
+          overfitGapThreshold: 1,
+          seed: 7,
+        },
+        reps: 1,
+        generations: 2,
+        populationSize: 2,
+      },
+      releaseThresholds: {
+        minPassRate: 0.0,
+        minMeanScore: 0.0,
+        minSearchRuns: 1,
+        minHoldoutRuns: 1,
+        maxOverfitGap: 1,
+        requireAsiForFailures: false,
+        requireCorpus: false,
+      },
+      ship: {
+        client,
+        repo: { owner: 'tangle-network', name: 'tax-agent' },
+        branchPrefix: 'eval/auto-improve',
+        promptFilePath: 'prompts/system.txt',
+        reviewers: ['drew'],
+        labels: ['production-loop'],
+      },
+      cron: { cadence: 'weekly' },
+    })
+
+    expect(result.decision).toBe('pr_opened')
+    expect(result.pullRequest).not.toBeNull()
+    expect(result.pullRequest?.prUrl).toContain('/pull/42')
+    expect(result.pullRequest?.branchName).toBe('eval/auto-improve/prod-loop-ship')
+    // The PR must carry the prompt file diff + cluster context.
+    expect(calls).toHaveLength(1)
+    expect(calls[0]?.fileChanges).toHaveLength(1)
+    expect(calls[0]?.fileChanges[0]?.path).toBe('prompts/system.txt')
+    expect(calls[0]?.body).toContain('Held-out promotion gate')
+    expect(calls[0]?.body).toContain('Release confidence')
+    expect(calls[0]?.body).toContain('Triggering failure cluster')
+    expect(calls[0]?.reviewers).toEqual(['drew'])
+    expect(calls[0]?.labels).toEqual(['production-loop'])
+    expect(result.cron?.cadence).toBe('weekly')
+  })
+
+  it('fails closed: when the held-out gate rejects, no PR is opened', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge')
+    const { client, calls } = fakeAutoPrClient()
+
+    // Mutator returns variants that SCORE WORSE — the gate will reject.
+    const worseMutator: MultiShotMutateAdapter<Payload> = {
+      mutate: async ({ parent, childCount, generation }) =>
+        Array.from({ length: childCount }, (_, i) => ({
+          id: `${parent.id}-worse-${generation}-${i}`,
+          label: 'worse',
+          generation,
+          parentId: parent.id,
+          payload: { systemPrompt: '' },
+        })),
+    }
+
+    const result = await runProductionLoop<Payload>({
+      runId: 'prod-loop-reject',
+      target: 'tax-agent',
+      traceStore,
+      feedbackStore,
+      cluster: { minClusterSize: 5, minSeverityRatio: 0 },
+      evolve: {
+        runner: makeRunner(),
+        scorer: makeScorer(),
+        mutator: worseMutator,
+        baselinePrompt: {
+          systemPrompt:
+            'You are a tax assistant. Cite sources. Refuse unsupported claims. Walk through state-by-state.',
+        },
+        holdoutScenarios: [scenario('hold-a'), scenario('hold-b'), scenario('hold-c')],
+        searchScenarios: [scenario('search-a'), scenario('search-b')],
+        gate: {
+          baselineKey: 'baseline',
+          minProductiveRuns: 1,
+          pairedDeltaThreshold: 0,
+          overfitGapThreshold: 1,
+          seed: 7,
+        },
+        reps: 1,
+        generations: 2,
+        populationSize: 2,
+      },
+      releaseThresholds: {
+        minPassRate: 0.0,
+        minMeanScore: 0.0,
+        minSearchRuns: 1,
+        minHoldoutRuns: 1,
+        maxOverfitGap: 1,
+        requireAsiForFailures: false,
+        requireCorpus: false,
+      },
+      ship: {
+        client,
+        repo: { owner: 'tangle-network', name: 'tax-agent' },
+        branchPrefix: 'eval/auto-improve',
+        promptFilePath: 'prompts/system.txt',
+      },
+    })
+
+    // Either evolve_yielded_no_improvement (when search-best stays baseline)
+    // or gate_failed (when search-best wins search but holdout fails). Both
+    // mean: no PR opened. That's the load-bearing assertion.
+    expect(['gate_failed', 'evolve_yielded_no_improvement']).toContain(result.decision)
+    expect(result.pullRequest).toBeNull()
+    expect(calls).toHaveLength(0)
+  })
+
+  it('returns evolve_yielded_no_improvement when the mutator returns no children', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge')
+
+    const result = await runProductionLoop<Payload>({
+      runId: 'prod-loop-noop',
+      target: 'tax-agent',
+      traceStore,
+      feedbackStore,
+      cluster: { minClusterSize: 5, minSeverityRatio: 0 },
+      evolve: {
+        runner: makeRunner(),
+        scorer: makeScorer(),
+        mutator: makeIdentityMutator(),
+        baselinePrompt: { systemPrompt: 'baseline-only.' },
+        holdoutScenarios: [scenario('hold-a'), scenario('hold-b'), scenario('hold-c')],
+        searchScenarios: [scenario('search-a'), scenario('search-b')],
+        gate: { baselineKey: 'baseline', minProductiveRuns: 1, pairedDeltaThreshold: 0, seed: 7 },
+        reps: 1,
+        generations: 1,
+        populationSize: 1,
+      },
+      releaseThresholds: { requireCorpus: false, requireAsiForFailures: false },
+    })
+
+    expect(result.decision).toBe('evolve_yielded_no_improvement')
+    expect(result.pullRequest).toBeNull()
+  })
+
+  it('validates inputs (regression: empty runId, empty holdout, conflicting search/holdout)', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+
+    await expect(
+      runProductionLoop<Payload>({
+        runId: '   ',
+        target: 'tax-agent',
+        traceStore,
+        feedbackStore,
+        cluster: {},
+        evolve: {
+          runner: makeRunner(),
+          scorer: makeScorer(),
+          mutator: makeImprovementMutator(),
+          baselinePrompt: { systemPrompt: 'x' },
+          holdoutScenarios: [scenario('h-a')],
+          gate: { baselineKey: 'baseline' },
+        },
+      }),
+    ).rejects.toThrow(/runId required/)
+
+    await expect(
+      runProductionLoop<Payload>({
+        runId: 'r',
+        target: '   ',
+        traceStore,
+        feedbackStore,
+        cluster: {},
+        evolve: {
+          runner: makeRunner(),
+          scorer: makeScorer(),
+          mutator: makeImprovementMutator(),
+          baselinePrompt: { systemPrompt: 'x' },
+          holdoutScenarios: [scenario('h-a')],
+          gate: { baselineKey: 'baseline' },
+        },
+      }),
+    ).rejects.toThrow(/target required/)
+
+    await expect(
+      runProductionLoop<Payload>({
+        runId: 'r',
+        target: 'tax-agent',
+        traceStore,
+        feedbackStore,
+        cluster: {},
+        evolve: {
+          runner: makeRunner(),
+          scorer: makeScorer(),
+          mutator: makeImprovementMutator(),
+          baselinePrompt: { systemPrompt: 'x' },
+          holdoutScenarios: [],
+          gate: { baselineKey: 'baseline' },
+        },
+      }),
+    ).rejects.toThrow(/holdoutScenarios must not be empty/)
+
+    // Search/holdout overlap.
+    for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge')
+    await expect(
+      runProductionLoop<Payload>({
+        runId: 'r',
+        target: 'tax-agent',
+        traceStore,
+        feedbackStore,
+        cluster: { minClusterSize: 5, minSeverityRatio: 0 },
+        evolve: {
+          runner: makeRunner(),
+          scorer: makeScorer(),
+          mutator: makeImprovementMutator(),
+          baselinePrompt: { systemPrompt: 'x' },
+          holdoutScenarios: [scenario('a')],
+          searchScenarios: [scenario('a')],
+          gate: { baselineKey: 'baseline' },
+        },
+      }),
+    ).rejects.toThrow(/disjoint/)
+  })
+})
diff --git a/tests/wire-ingestion.test.ts b/tests/wire-ingestion.test.ts
new file mode 100644
index 0000000..bdca59e
--- /dev/null
+++ b/tests/wire-ingestion.test.ts
@@ -0,0 +1,295 @@
+/**
+ * Wire-protocol ingestion tests (0.25.0).
+ *
+ * Regression coverage:
+ *   - POST /v1/feedback persists into the configured FeedbackTrajectoryStore
+ *   - POST /v1/feedback returns 400 ValidationError on malformed payload
+ *   - POST /v1/feedback returns 503 when no store is wired
+ *   - POST /v1/traces/ingest accepts both JSON ({events:[...]}) and NDJSON
+ *   - POST /v1/traces/ingest reports per-event errors without poisoning the batch
+ *   - Bearer auth (when configured) blocks ingestion without a valid token
+ *     but never blocks /healthz or /v1/version
+ *   - OpenAPI doc lists the new endpoints + components
+ */
+import { describe, expect, it } from 'vitest'
+
+import { InMemoryFeedbackTrajectoryStore } from '../src/feedback-trajectory'
+import { InMemoryTraceStore } from '../src/trace/store'
+import { createApp } from '../src/wire/server'
+
+async function postJson(app: ReturnType<typeof createApp>, path: string, body: unknown, headers?: Record<string, string>) {
+  const res = await app.fetch(
+    new Request(`http://localhost${path}`, {
+      method: 'POST',
+      headers: { 'content-type': 'application/json', ...headers },
+      body: JSON.stringify(body),
+    }),
+  )
+  const responseBody = res.status === 204 ? null : await res.json().catch(() => null)
+  return { status: res.status, body: responseBody }
+}
+
+async function postRaw(
+  app: ReturnType<typeof createApp>,
+  path: string,
+  body: string,
+  contentType: string,
+  headers?: Record<string, string>,
+) {
+  const res = await app.fetch(
+    new Request(`http://localhost${path}`, {
+      method: 'POST',
+      headers: { 'content-type': contentType, ...headers },
+      body,
+    }),
+  )
+  const responseBody = res.status === 204 ? null : await res.json().catch(() => null)
+  return { status: res.status, body: responseBody }
+}
+
+function validFeedback(id = 'ft_1') {
+  return {
+    id,
+    task: { intent: 'help filing 2025 taxes' },
+    attempts: [],
+    labels: [
+      {
+        source: 'user',
+        kind: 'approve',
+        value: { thumb: 'up' },
+        createdAt: '2026-05-14T00:00:00Z',
+      },
+    ],
+    createdAt: '2026-05-14T00:00:00Z',
+  }
+}
+
+describe('POST /v1/feedback', () => {
+  it('persists a feedback trajectory and returns 200 with id + persisted:true', async () => {
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    const app = createApp({ stores: { feedbackStore } })
+
+    const r = await postJson(app, '/v1/feedback', validFeedback('ft_persist'))
+    expect(r.status).toBe(200)
+    expect((r.body as { id: string }).id).toBe('ft_persist')
+    expect((r.body as { persisted: boolean }).persisted).toBe(true)
+
+    const stored = await feedbackStore.get('ft_persist')
+    expect(stored).not.toBeNull()
+    expect(stored?.task.intent).toBe('help filing 2025 taxes')
+  })
+
+  it('returns 400 ValidationError on malformed payload (regression: silent 200 on bad input)', async () => {
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    const app = createApp({ stores: { feedbackStore } })
+
+    const r = await postJson(app, '/v1/feedback', { id: 'x' /* missing task */ })
+    expect(r.status).toBe(400)
+    expect((r.body as { error: { code: string } }).error.code).toBe('validation_error')
+  })
+
+  it('returns 503 when no feedback store is configured', async () => {
+    const app = createApp() // no stores
+    const r = await postJson(app, '/v1/feedback', validFeedback())
+    expect(r.status).toBe(503)
+    expect((r.body as { error: { code: string } }).error.code).toBe('service_unavailable')
+  })
+
+  it('is idempotent on id (re-posting replaces)', async () => {
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    const app = createApp({ stores: { feedbackStore } })
+
+    await postJson(app, '/v1/feedback', validFeedback('ft_dup'))
+    const second = await postJson(app, '/v1/feedback', {
+      ...validFeedback('ft_dup'),
+      task: { intent: 'NEW intent' },
+    })
+    expect(second.status).toBe(200)
+    const stored = await feedbackStore.get('ft_dup')
+    expect(stored?.task.intent).toBe('NEW intent')
+  })
+})
+
+describe('POST /v1/traces/ingest', () => {
+  it('accepts JSON body and persists events', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const app = createApp({ stores: { traceStore } })
+
+    const r = await postJson(app, '/v1/traces/ingest', {
+      events: [
+        {
+          eventId: 'e1',
+          runId: 'r1',
+          kind: 'log',
+          timestamp: 1_700_000_000_000,
+          payload: { msg: 'hello' },
+        },
+        {
+          eventId: 'e2',
+          runId: 'r1',
+          kind: 'error',
+          timestamp: 1_700_000_001_000,
+          payload: { msg: 'boom' },
+        },
+      ],
+    })
+
+    expect(r.status).toBe(200)
+    expect((r.body as { accepted: number }).accepted).toBe(2)
+    expect((r.body as { rejected: number }).rejected).toBe(0)
+    const events = await traceStore.events({ runId: 'r1' })
+    expect(events).toHaveLength(2)
+    expect(events.map((e) => e.eventId).sort()).toEqual(['e1', 'e2'])
+  })
+
+  it('accepts NDJSON body and persists events', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const app = createApp({ stores: { traceStore } })
+
+    const ndjson = [
+      JSON.stringify({
+        eventId: 'n1',
+        runId: 'r2',
+        kind: 'log',
+        timestamp: 1,
+        payload: { line: 1 },
+      }),
+      JSON.stringify({
+        eventId: 'n2',
+        runId: 'r2',
+        kind: 'log',
+        timestamp: 2,
+        payload: { line: 2 },
+      }),
+      '',
+    ].join('\n')
+
+    const r = await postRaw(app, '/v1/traces/ingest', ndjson, 'application/x-ndjson')
+    expect(r.status).toBe(200)
+    expect((r.body as { accepted: number }).accepted).toBe(2)
+
+    const events = await traceStore.events({ runId: 'r2' })
+    expect(events).toHaveLength(2)
+  })
+
+  it('returns 400 on malformed schema', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const app = createApp({ stores: { traceStore } })
+    const r = await postJson(app, '/v1/traces/ingest', { events: [{ eventId: '', runId: 'r1' }] })
+    expect(r.status).toBe(400)
+    expect((r.body as { error: { code: string } }).error.code).toBe('validation_error')
+  })
+
+  it('returns 503 when no trace store is configured', async () => {
+    const app = createApp()
+    const r = await postJson(app, '/v1/traces/ingest', {
+      events: [{ eventId: 'e', runId: 'r', kind: 'log', timestamp: 1, payload: {} }],
+    })
+    expect(r.status).toBe(503)
+  })
+})
+
+describe('bearer auth (opt-in)', () => {
+  it('blocks ingestion without an Authorization header', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    const app = createApp({
+      stores: { traceStore, feedbackStore },
+      auth: { bearer: 'sk-prod-1234' },
+    })
+
+    const r = await postJson(app, '/v1/feedback', validFeedback())
+    expect(r.status).toBe(401)
+    const r2 = await postJson(app, '/v1/traces/ingest', {
+      events: [{ eventId: 'e', runId: 'r', kind: 'log', timestamp: 1, payload: {} }],
+    })
+    expect(r2.status).toBe(401)
+  })
+
+  it('accepts a valid bearer token', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    const app = createApp({
+      stores: { traceStore, feedbackStore },
+      auth: { bearer: 'sk-prod-1234' },
+    })
+    const r = await postJson(app, '/v1/feedback', validFeedback(), {
+      authorization: 'Bearer sk-prod-1234',
+    })
+    expect(r.status).toBe(200)
+  })
+
+  it('rejects a wrong bearer token', async () => {
+    const traceStore = new InMemoryTraceStore()
+    const app = createApp({
+      stores: { traceStore },
+      auth: { bearer: 'sk-prod-1234' },
+    })
+    const r = await postJson(
+      app,
+      '/v1/traces/ingest',
+      { events: [{ eventId: 'e', runId: 'r', kind: 'log', timestamp: 1, payload: {} }] },
+      { authorization: 'Bearer wrong' },
+    )
+    expect(r.status).toBe(401)
+  })
+
+  it('always exempts /healthz and /v1/version (regression: lock-out from monitoring)', async () => {
+    const app = createApp({
+      auth: { bearer: 'sk-prod-1234' },
+      stores: {},
+    })
+    const health = await app.fetch(new Request('http://localhost/healthz'))
+    expect(health.status).toBe(200)
+    const version = await app.fetch(new Request('http://localhost/v1/version'))
+    expect(version.status).toBe(200)
+  })
+
+  it('supports a verifier function for rotating tokens', async () => {
+    const feedbackStore = new InMemoryFeedbackTrajectoryStore()
+    let calledWith: string | undefined
+    const app = createApp({
+      stores: { feedbackStore },
+      auth: {
+        bearer: (token: string) => {
+          calledWith = token
+          return token.startsWith('rotating-')
+        },
+      },
+    })
+
+    const ok = await postJson(app, '/v1/feedback', validFeedback(), {
+      authorization: 'Bearer rotating-abc',
+    })
+    expect(ok.status).toBe(200)
+    expect(calledWith).toBe('rotating-abc')
+
+    const bad = await postJson(app, '/v1/feedback', validFeedback('ft_2'), {
+      authorization: 'Bearer static-token',
+    })
+    expect(bad.status).toBe(401)
+  })
+})
+
+describe('OpenAPI spec', () => {
+  it('lists the new ingestion endpoints + component schemas', async () => {
+    const app = createApp()
+    const res = await app.fetch(new Request('http://localhost/openapi.json'))
+    expect(res.status).toBe(200)
+    const spec = (await res.json()) as {
+      paths: Record<string, unknown>
+      components: { schemas: Record<string, unknown> }
+    }
+    expect(Object.keys(spec.paths)).toEqual(
+      expect.arrayContaining(['/v1/feedback', '/v1/traces/ingest']),
+    )
+    expect(Object.keys(spec.components.schemas)).toEqual(
+      expect.arrayContaining([
+        'FeedbackTrajectory',
+        'FeedbackIngestResponse',
+        'TracesIngestRequest',
+        'TracesIngestResponse',
+      ]),
+    )
+  })
+})