From 5166cd7dd9bf4c1316bce81da337c7cc59f7e532 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Thu, 14 May 2026 11:27:07 -0600 Subject: [PATCH] =?UTF-8?q?feat(0.25.0):=20ProductionLoop=20primitive=20?= =?UTF-8?q?=E2=80=94=20close=20the=20eval=E2=86=92prod=E2=86=92eval=20cycl?= =?UTF-8?q?e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pieces to close the loop were already in the package (runMultiShotOptimization, failureClusterView, evaluateReleaseConfidence, extractPreferences, FeedbackTrajectoryStore, TraceStore). This release adds the one clean primitive that wires them end-to-end. - runProductionLoop({...}): one call = one cycle. Ingest prod traces + feedback, cluster failures, evolve against the worst cluster, gate fail-closed, open a PR with the new prompt. Idempotent + replayable; cron is the consumer's job. - proposeAutomatedPullRequest + ghCliClient / httpGithubClient (no new deps; both transports tested with fakes). - POST /v1/feedback + POST /v1/traces/ingest (NDJSON-capable) wire endpoints. Optional bearer auth (healthz / version stay exempt). - examples/production-loop: runnable synthetic end-to-end demo. - 33 new tests (1052 total, up from 1019). Typecheck + biome + build clean. openapi.json regenerated. --- CHANGELOG.md | 65 +++ README.md | 71 +++ clients/python/pyproject.toml | 2 +- examples/production-loop/README.md | 79 +++ examples/production-loop/index.ts | 292 +++++++++++ package.json | 2 +- src/auto-pr.ts | 509 +++++++++++++++++++ src/index.ts | 36 ++ src/production-loop.ts | 679 ++++++++++++++++++++++++++ src/wire/handlers.ts | 85 +++- src/wire/openapi.ts | 75 +++ src/wire/schemas.ts | 165 +++++++ src/wire/server.ts | 129 ++++- tests/auto-pr.test.ts | 290 +++++++++++ tests/production-loop-example.test.ts | 186 +++++++ tests/production-loop.test.ts | 460 +++++++++++++++++ tests/wire-ingestion.test.ts | 295 +++++++++++ 17 files changed, 3409 insertions(+), 11 deletions(-) create mode 100644 examples/production-loop/README.md create mode 100644 examples/production-loop/index.ts create mode 100644 src/auto-pr.ts create mode 100644 src/production-loop.ts create mode 100644 tests/auto-pr.test.ts create mode 100644 tests/production-loop-example.test.ts create mode 100644 tests/production-loop.test.ts create mode 100644 tests/wire-ingestion.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 452f9e8..c56c397 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,70 @@ # Changelog +## 0.25.0 — ProductionLoop primitive: close the eval → prod → eval cycle + +This release ships the **orchestration layer** that turns the existing +eval substrate into a continuously-improving production system. Static +prompts decay; today's regulation flips tomorrow. The pieces to close +the loop were already in the package (`runMultiShotOptimization`, +`failureClusterView`, `evaluateReleaseConfidence`, `extractPreferences`, +`FeedbackTrajectoryStore`, `TraceStore`); this release adds the one +clean primitive that wires them together end-to-end. + +### Added + +- **`runProductionLoop({ ... })`** (`src/production-loop.ts`, + `@experimental`) — one call = one cycle. Ingests production traces + and feedback, clusters failures, runs evolve against the worst + cluster, gates with `HeldOutGate` + `evaluateReleaseConfidence` + (fail-closed), and — when wired with an `AutoPrClient` — opens a PR + with the improved prompt. Idempotent + replayable: same `runId` + yields the same plan. Cron / GitHub Actions are the consumer's job; + the primitive doesn't own scheduling. + +- **`proposeAutomatedPullRequest(client, input)`** + two transports + (`src/auto-pr.ts`, `@experimental`): + - `httpGithubClient({ token, ... })` — direct REST against + `api.github.com`, no extra deps. Idempotent on branch name: + existing open PRs are returned, not duplicated. + - `ghCliClient({ ... })` — shells out to `gh` for environments + where developer auth state is already configured. + Both validate inputs (no `..` paths, no whitespace branches, no + duplicate file changes) and surface `ValidationError` / `ConfigError` + from the typed taxonomy. + +- **`POST /v1/feedback` + `POST /v1/traces/ingest`** wire endpoints + (`src/wire/`). Both Zod-validated, both append to the configured + store (`FeedbackTrajectoryStore` / `TraceStore`). 503 when no store + is wired (fail loud, not silent). Traces ingest accepts both + `application/json` (`{events:[...]}`) and `application/x-ndjson` for + streaming production runtimes. Schemas (`TraceEvent`, + `FeedbackTrajectory`, `TracesIngestRequest/Response`, + `FeedbackIngestResponse`) added to `openapi.json` for cross-language + clients. + +- **Optional bearer-token auth** on the wire server, configured via + `createApp({ auth: { bearer: '...' } })` or as a verifier function + for rotating tokens. `/healthz` and `/v1/version` remain unprotected + (regression: never lock monitoring out of the runtime). + +- **`examples/production-loop/`** — synthetic end-to-end demo wiring + the loop against in-memory trace + feedback stores and a fake + auto-PR client. Shows the failure-cluster trigger, the evolve round, + the gate verdict, and the PR-shaped output without requiring + credentials or a live model. + +### Changed + +- **Wire server** (`createApp(opts)`) now accepts optional + `IngestionStores` (`{ traceStore?, feedbackStore? }`) and `auth`. + Existing zero-arg callers continue to work — judge / rubrics / + version / healthz are unchanged. + +### Status tags + +- Every new export is `@experimental` initially. Pin the patch version + if you depend on it. All other 0.24.0 stability tags are preserved. + ## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices This release is **DX + correctness**. No production behavior moved; consumer diff --git a/README.md b/README.md index 0a11bf8..80ae63d 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,75 @@ await product.storeEvalResult(task.id, result) Same loop shape in production, replay, benchmark, and optimization. Swap the dependencies behind `observe()` and `act()`, never the eval contract. +## Production loop — close the eval → prod → eval cycle (0.25.0) + +Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk +becomes today's incident. The production agents that win are the ones that +**continuously re-train against live failure modes**. + +`runProductionLoop` is the orchestration layer that wires the existing eval +substrate into a self-improvement cron: + +```ts +import { + runProductionLoop, + httpGithubClient, + FileSystemFeedbackTrajectoryStore, +} from '@tangle-network/agent-eval' +import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces' + +const result = await runProductionLoop({ + runId: `weekly-${new Date().toISOString().slice(0, 10)}`, + target: 'tax-agent', + + // 1. Where production traces + feedback land. Wire the HTTP ingestion + // endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your + // runtime; the same store reads them here. + traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }), + feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }), + + // 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus. + cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 }, + + // 3. Evolve: seed = current prompt, gate against holdout scenarios. + evolve: { + baselinePrompt: currentSystemPrompt, + holdoutScenarios: productionShapeScenarios, + runner, // your agent driver + scorer, // calibrated judge or rubric + mutator, // GEPA-style or addendum-style mutator + gate: { + baselineKey: 'baseline', + minProductiveRuns: 5, + pairedDeltaThreshold: 0.03, // require Nσ improvement on holdout + overfitGapThreshold: 0.10, + }, + }, + + // 4. Ship: when the gate passes, open a PR with the new prompt. + ship: { + client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }), + repo: { owner: 'tangle-network', name: 'tax-agent' }, + branchPrefix: 'eval/auto-improve', + promptFilePath: 'prompts/tax-agent-system.txt', + reviewers: ['drew'], + }, + + cron: { cadence: 'weekly' }, // surface-only; consumer schedules +}) + +console.log(result.decision) // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ... +console.log(result.pullRequest?.prUrl) // populated when a PR was opened +``` + +The primitive runs **one cycle**. Schedule it with `workflow_dispatch` + cron in +GitHub Actions. It is **idempotent + replayable**: same `runId` → same plan. +Gate failures are fail-closed — a candidate that beats baseline on search but +overfits on holdout never lands. + +Full runnable demo (synthetic traces, no credentials) in +[`examples/production-loop`](./examples/production-loop/README.md). + ## Self-improvement loop Eval doesn't end at "pass/fail." Outcomes become training signal, mutation @@ -222,6 +291,8 @@ and runtime. See [`examples/`](./examples/). closed loop — score, reflect, mutate, re-score, repeat. - [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md): RunRecord → preferences → trainer (prime-rl) → next campaign. +- [`examples/production-loop`](./examples/production-loop/README.md): + ingest prod traces + feedback, cluster failures, evolve, gate, open a PR. ## Docs diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 8308d99..e825c86 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agent-eval-rpc" -version = "0.24.0" +version = "0.25.0" description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client." readme = "README.md" requires-python = ">=3.10" diff --git a/examples/production-loop/README.md b/examples/production-loop/README.md new file mode 100644 index 0000000..5741c3d --- /dev/null +++ b/examples/production-loop/README.md @@ -0,0 +1,79 @@ +# Production loop + +End-to-end demo of `runProductionLoop` — the orchestration layer that +closes eval → prod → eval. + +## What it shows + +- 8 synthetic production failures (all hitting the same `instruction_following` + failure class — missing statute citations on FTC rule questions) seeded + into an `InMemoryTraceStore`. +- 8 matching 👎 user-feedback labels seeded into an + `InMemoryFeedbackTrajectoryStore`. +- One `runProductionLoop` cycle: + - `failureClusterView` surfaces the cluster, which crosses the + `minClusterSize: 5` threshold. + - `runMultiShotOptimization` runs 2 generations × 2 reps over 3 + holdout scenarios, with an addendum-style mutator that appends a + citation directive to the baseline prompt. + - `HeldOutGate` checks that the paired-Δ on the holdout split is + positive with `minProductiveRuns: 3`. + - `evaluateReleaseConfidence` cross-checks pass-rate, mean score, + overfit gap, and the gate decision (fail-closed on any axis). + - On pass, a fake `AutoPrClient` captures the PR plan — a real + deployment would wire `httpGithubClient({ token })` or + `ghCliClient()`. + +## Run + +```sh +pnpm tsx examples/production-loop/index.ts +``` + +## Expected output + +``` +═══════════════════════════════════════════════════════════════ +production-loop demo · synthetic prod data → improved prompt +═══════════════════════════════════════════════════════════════ +runId : prod-loop-demo- +target : tax-agent +decision : pr_opened +observed runs : 8 +observed feedback: 8 +clusters seen : 1 +acted-on : class=instruction_following runs=8 scenarios=1 +gate : promote=true medianΔ=0.450 CI=[0.450, 0.450] +release status : pass (passRate=...) +─────────────────────────────────────────────────────────────── +PR opened : https://github.com/tangle-network/tax-agent/pull/synthetic-1 +branch : eval/auto-improve/prod-loop-demo- +head SHA : face-cafe-beef-... +─────────────────────────────────────────────────────────────── +PR title: tax-agent: production-loop prompt update (prod-loop-demo-) +PR file: prompts/tax-agent-system.txt +PR body preview: + ## Production-loop prompt update — `tax-agent` + + Run id: `prod-loop-demo-` + Decision: `pr_opened` + Observed in this cycle: 8 prod runs, 8 feedback trajectories. + + ### Triggering failure cluster + ... +═══════════════════════════════════════════════════════════════ +``` + +## Adapt this to your product + +| Synthetic | Production | +| ------------------------------- | --------------------------------------------------- | +| `InMemoryTraceStore` | `FileSystemTraceStore`, or HTTP-ingest via `POST /v1/traces/ingest` | +| `InMemoryFeedbackTrajectoryStore` | `FileSystemFeedbackTrajectoryStore`, or HTTP-ingest via `POST /v1/feedback` | +| deterministic `runner` | your agent driver invoking real tools | +| deterministic `scorer` | calibrated judge (`callLlmJson` + `Rubric`) | +| `captureAutoPrClient()` | `httpGithubClient({ token })` or `ghCliClient()` | +| `main()` | scheduled GitHub Action (`workflow_dispatch` + cron) | + +The primitive is **idempotent** + **replayable**: re-running with the +same `runId` produces the same plan. Safe to retry on transient errors. diff --git a/examples/production-loop/index.ts b/examples/production-loop/index.ts new file mode 100644 index 0000000..081074a --- /dev/null +++ b/examples/production-loop/index.ts @@ -0,0 +1,292 @@ +/** + * Production loop — runnable end-to-end demo. + * + * What this shows: + * - Feed 8 synthetic production failures into a TraceStore + matching + * user-feedback labels into a FeedbackTrajectoryStore. + * - Configure a `runProductionLoop` cycle: + * cluster threshold = 5 runs / 5% severity, + * evolve = 2 generations × 2 reps over 3 holdout scenarios, + * gate = paired-Δ > 0 over 3 productive runs, + * ship = a fake `AutoPrClient` that captures the PR plan. + * - Print the loop's decision, the gate evidence, and the + * PR-shaped artifact. + * + * No network. No credentials. Run with: + * + * pnpm tsx examples/production-loop/index.ts + * + * In a real wiring you would: + * - Replace the in-memory stores with a `FileSystemTraceStore` and + * `FileSystemFeedbackTrajectoryStore` shared with the production + * runtime (or wire the HTTP ingestion endpoints). + * - Replace the deterministic runner with your actual agent driver. + * - Replace the deterministic scorer with a calibrated judge + * (`callLlmJson` + a Rubric, or any `MultiShotScorer`). + * - Wire a real `AutoPrClient`: `httpGithubClient({ token: GITHUB_TOKEN })` + * in CI, or `ghCliClient()` for developer machines. + * - Trigger the loop via a `workflow_dispatch` or scheduled GitHub + * Action — the primitive runs ONE cycle; cron is the consumer's job. + */ + +import type { + AutoPrClient, + ProposeAutomatedPullRequestInput, + ProposeAutomatedPullRequestResult, +} from '../../src/auto-pr' +import { InMemoryFeedbackTrajectoryStore } from '../../src/feedback-trajectory' +import { runProductionLoop } from '../../src/production-loop' +import { InMemoryTraceStore } from '../../src/trace/store' +import type { Scenario } from '../../src/types' + +// ── 1. Domain types ────────────────────────────────────────────────── + +interface TaxAgentPayload { + systemPrompt: string +} + +function scenario(id: string, persona: string): Scenario { + return { + id, + persona, + label: id, + thesis: `Filing scenario: ${id}`, + dimensions: ['correctness', 'citation_quality'], + turns: [ + { + user: 'Help me file my taxes given my W-2 + 1099-NEC for 2025.', + expectedBehaviors: ['gather state', 'cite a statute', 'flag missing forms'], + }, + ], + artifactChecks: [], + } +} + +// ── 2. Seed production telemetry into the stores ───────────────────── + +async function seedProductionState() { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + + // 8 prod runs that all blew up on the same root cause: the agent + // failed to cite a statute when an FTC rule was contested. + for (let i = 0; i < 8; i++) { + await traceStore.appendRun({ + runId: `prod-run-${i}`, + scenarioId: 'ftc-noncompete-edge', + startedAt: Date.now() - 3_600_000 + i * 10_000, + endedAt: Date.now() - 3_600_000 + i * 10_000 + 500, + status: 'failed', + outcome: { + pass: false, + score: 0.2, + failureClass: 'instruction_following', + notes: 'Cited no statute; rubric requires section number on contested rules.', + }, + }) + // Matching 👎 feedback from the user. + await feedbackStore.save({ + id: `ft-${i}`, + scenarioId: 'ftc-noncompete-edge', + task: { intent: 'Explain whether the FTC non-compete rule applies to my contract.' }, + attempts: [], + labels: [ + { + source: 'user', + kind: 'reject', + value: { thumb: 'down', complaint: 'No citation. Made things up.' }, + severity: 'error', + createdAt: new Date().toISOString(), + }, + ], + createdAt: new Date().toISOString(), + }) + } + + return { traceStore, feedbackStore } +} + +// ── 3. Fake AutoPrClient that captures the PR plan ────────────────── + +function captureAutoPrClient(): { + client: AutoPrClient + captured: ProposeAutomatedPullRequestInput[] +} { + const captured: ProposeAutomatedPullRequestInput[] = [] + const client: AutoPrClient = { + proposeChange(input): Promise { + captured.push(input) + return Promise.resolve({ + prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/pull/synthetic-1`, + branchName: input.branchName, + headSha: 'face-cafe-beef-1234'.padEnd(40, '0'), + dryRun: false, + }) + }, + } + return { client, captured } +} + +// ── 4. The loop ────────────────────────────────────────────────────── + +async function main(): Promise { + const { traceStore, feedbackStore } = await seedProductionState() + const { client: prClient, captured: prCaptured } = captureAutoPrClient() + + const baselinePrompt = + 'You are a tax assistant. Be helpful and concise. ' + + 'Answer questions about US tax forms and rules.' + + const holdoutScenarios = [ + scenario('ftc-noncompete-edge', 'small-biz-owner'), + scenario('schedule-c-self-employed', 'freelancer'), + scenario('w2-multistate', 'remote-worker'), + ] + const searchScenarios = [ + scenario('basic-w2', 'salaried'), + scenario('joint-return', 'married'), + ] + + const result = await runProductionLoop({ + runId: `prod-loop-demo-${Date.now()}`, + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: { + minClusterSize: 5, + minSeverityRatio: 0.05, + maxClustersPerCycle: 1, + }, + evolve: { + baselinePrompt: { systemPrompt: baselinePrompt }, + holdoutScenarios, + searchScenarios, + // Deterministic runner: trace length scales with prompt length, modeling + // "longer prompts elicit more deliberate reasoning." + runner: { + run: ({ variant, scenarioId }) => ({ + trace: { + scenarioId, + turns: [ + { role: 'user', content: 'help' }, + { role: 'assistant', content: variant.payload.systemPrompt }, + ], + transcript: variant.payload.systemPrompt, + }, + costUsd: 0.01, + durationMs: 5, + }), + }, + // Deterministic scorer: a prompt that mentions "cite" scores higher. + scorer: { + score: ({ variant }) => { + const lengthSignal = Math.min(1, variant.payload.systemPrompt.length / 300) + const citationBonus = /cite/i.test(variant.payload.systemPrompt) ? 0.3 : 0 + const refuseBonus = /refuse/i.test(variant.payload.systemPrompt) ? 0.15 : 0 + const score = Math.min(1, 0.3 + lengthSignal * 0.4 + citationBonus + refuseBonus) + return { score, ok: score >= 0.6 } + }, + }, + // Addendum-style mutator: append a citation directive. + mutator: { + mutate: async ({ parent, childCount, generation }) => + Array.from({ length: childCount }, (_, i) => ({ + id: `${parent.id}-cite-${generation}-${i}`, + label: 'cite-required', + generation, + parentId: parent.id, + payload: { + systemPrompt: + `${parent.payload.systemPrompt} ` + + 'When the question concerns a contested rule (FTC, IRS, state tax authority), ' + + 'you MUST cite the statute or regulation by section number, and you MUST refuse ' + + 'to answer if the rule is not present in your active corpus.', + }, + rationale: 'Address `instruction_following` failure cluster: missing citations.', + })), + }, + gate: { + baselineKey: 'baseline', + minProductiveRuns: 3, + pairedDeltaThreshold: 0, + overfitGapThreshold: 0.5, + seed: 1234, + }, + reps: 2, + generations: 2, + populationSize: 2, + }, + releaseThresholds: { + requireCorpus: false, + minPassRate: 0.5, + minMeanScore: 0.5, + minSearchRuns: 1, + minHoldoutRuns: 1, + requireAsiForFailures: false, + }, + ship: { + client: prClient, + repo: { owner: 'tangle-network', name: 'tax-agent' }, + branchPrefix: 'eval/auto-improve', + promptFilePath: 'prompts/tax-agent-system.txt', + baseBranch: 'main', + reviewers: ['drew'], + labels: ['production-loop', 'auto-improve'], + }, + cron: { cadence: 'weekly', jitterSec: 600 }, + }) + + // ── Render ────────────────────────────────────────────────────────── + console.log('═══════════════════════════════════════════════════════════════') + console.log('production-loop demo · synthetic prod data → improved prompt') + console.log('═══════════════════════════════════════════════════════════════') + console.log(`runId : ${result.runId}`) + console.log(`target : ${result.target}`) + console.log(`decision : ${result.decision}`) + console.log(`observed runs : ${result.observedRunCount}`) + console.log(`observed feedback: ${result.observedFeedbackCount}`) + console.log(`clusters seen : ${result.clusters.length}`) + if (result.actedOnCluster) { + console.log(`acted-on : class=${result.actedOnCluster.failureClass} ` + + `runs=${result.actedOnCluster.runCount} ` + + `scenarios=${result.actedOnCluster.scenarioIds.length}`) + } + if (result.gate) { + console.log(`gate : promote=${result.gate.promote} ` + + `medianΔ=${result.gate.evidence.medianPairedDelta.toFixed(3)} ` + + `CI=[${result.gate.evidence.pairedCI.low.toFixed(3)}, ` + + `${result.gate.evidence.pairedCI.high.toFixed(3)}]`) + } + if (result.release) { + console.log(`release status : ${result.release.status} ` + + `(passRate=${result.release.metrics.passRate.toFixed(3)} ` + + `meanScore=${result.release.metrics.meanScore.toFixed(3)})`) + } + if (result.pullRequest) { + console.log('───────────────────────────────────────────────────────────────') + console.log(`PR opened : ${result.pullRequest.prUrl}`) + console.log(`branch : ${result.pullRequest.branchName}`) + console.log(`head SHA : ${result.pullRequest.headSha}`) + console.log('───────────────────────────────────────────────────────────────') + const pr = prCaptured[0] + if (pr) { + console.log('PR title:', pr.title) + console.log('PR file:', pr.fileChanges[0]?.path) + console.log('PR body preview:') + console.log( + pr.body + .split('\n') + .slice(0, 20) + .map((line) => ` ${line}`) + .join('\n'), + ) + console.log(' ...') + } + } + console.log('═══════════════════════════════════════════════════════════════') +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/package.json b/package.json index da70e68..a18ec3d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.24.0", + "version": "0.25.0", "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.", "homepage": "https://github.com/tangle-network/agent-eval#readme", "repository": { diff --git a/src/auto-pr.ts b/src/auto-pr.ts new file mode 100644 index 0000000..aaed655 --- /dev/null +++ b/src/auto-pr.ts @@ -0,0 +1,509 @@ +/** + * Automated pull request opener for the production loop. + * + * `runProductionLoop` produces a `promotedPrompt` string and a release + * scorecard. To close the eval → prod → eval cycle the framework needs + * to land that prompt as a reviewable code change. This module does + * exactly that: + * + * 1. Stage a branch off `baseBranch`. + * 2. Write each `fileChange` into the worktree. + * 3. Commit + push. + * 4. Open a PR via the GitHub API. + * + * Two transports ship in core: + * + * - `ghCliClient(opts)` — shells out to the `gh` CLI. No extra deps, + * re-uses the developer machine's `gh auth` state, works with both + * github.com and GitHub Enterprise. This is the recommended default. + * - `httpGithubClient(opts)` — direct `fetch` against `api.github.com` + * with a bearer token. Useful in CI where `gh` may not be installed. + * + * Both implement the small `AutoPrClient` interface, so tests substitute + * a fake without spinning a process or network. + * + * @experimental — added in 0.25.0. Surface may evolve as consumers wire + * it into CI workflows. + */ + +import { ConfigError, ValidationError } from './errors' + +export interface FileChange { + /** Repo-relative path. Forward slashes; no `..`. */ + path: string + /** New file contents. UTF-8. */ + contents: string + /** Optional explanatory comment shown in the commit body. */ + rationale?: string +} + +export interface RepoRef { + owner: string + name: string +} + +export interface ProposeAutomatedPullRequestInput { + repo: RepoRef + /** Branch to base the PR on. Default `'main'`. */ + baseBranch?: string + /** New branch name. Use a prefix + a short stable id; no spaces. */ + branchName: string + fileChanges: FileChange[] + title: string + body: string + /** Optional GitHub usernames to request review from. */ + reviewers?: string[] + /** Optional labels to apply. */ + labels?: string[] + /** Commit author name. Default: derived from the GitHub client. */ + authorName?: string + /** Commit author email. Default: derived from the GitHub client. */ + authorEmail?: string + /** Dry-run — do not push or open a PR; just return the would-be plan. */ + dryRun?: boolean +} + +export interface ProposeAutomatedPullRequestResult { + prUrl: string + branchName: string + headSha: string + dryRun: boolean +} + +/** Pluggable transport for the auto-PR pipeline. */ +export interface AutoPrClient { + /** + * Create a branch from `baseBranch`, write file changes, commit, push, + * and open a PR. Returns the PR's HTML url and head SHA. + * + * Implementations must be idempotent on `branchName`: if the branch + * already exists with the same head SHA as the would-be commit, return + * the existing PR rather than failing. This makes the production loop + * safe to retry on transient errors. + */ + proposeChange(input: ProposeAutomatedPullRequestInput): Promise +} + +export async function proposeAutomatedPullRequest( + client: AutoPrClient, + input: ProposeAutomatedPullRequestInput, +): Promise { + validate(input) + return client.proposeChange(input) +} + +function validate(input: ProposeAutomatedPullRequestInput): void { + if (!input.repo.owner.trim() || !input.repo.name.trim()) { + throw new ValidationError('proposeAutomatedPullRequest: repo.owner and repo.name required') + } + if (!input.branchName.trim() || /\s/.test(input.branchName)) { + throw new ValidationError( + 'proposeAutomatedPullRequest: branchName must be non-empty and contain no whitespace', + ) + } + if (input.branchName === (input.baseBranch ?? 'main')) { + throw new ValidationError('proposeAutomatedPullRequest: branchName must differ from baseBranch') + } + if (input.fileChanges.length === 0) { + throw new ValidationError('proposeAutomatedPullRequest: fileChanges must not be empty') + } + const seenPaths = new Set() + for (const change of input.fileChanges) { + if (!change.path.trim() || change.path.includes('..') || change.path.startsWith('/')) { + throw new ValidationError( + `proposeAutomatedPullRequest: invalid file path "${change.path}" (no '..' or leading '/')`, + ) + } + if (seenPaths.has(change.path)) { + throw new ValidationError(`proposeAutomatedPullRequest: duplicate file path "${change.path}"`) + } + seenPaths.add(change.path) + } + if (!input.title.trim()) { + throw new ValidationError('proposeAutomatedPullRequest: title must not be empty') + } +} + +// ── HTTP transport (uses `fetch` against api.github.com) ───────────── + +export interface HttpGithubClientOptions { + /** Personal access token, GitHub App token, or `GITHUB_TOKEN` from Actions. */ + token: string + /** Override for GitHub Enterprise. Default `'https://api.github.com'`. */ + apiBase?: string + /** Test seam — defaults to global `fetch`. */ + fetchImpl?: typeof fetch + /** Test seam — clock for commit timestamps. */ + now?: () => Date +} + +interface GhRef { + ref: string + object: { sha: string } +} + +interface GhCommit { + sha: string + tree: { sha: string } +} + +interface GhBlob { + sha: string +} + +interface GhTree { + sha: string +} + +interface GhPullRequest { + html_url: string + number: number +} + +/** + * Direct REST-API GitHub client. No external deps. + * + * Idempotency strategy: before creating refs/commits/PRs, check whether + * the branch already exists at the desired tree. If so, return the + * existing PR (or open one if missing). Errors from concurrent runs + * (`Reference already exists`) are caught and treated as success. + */ +export function httpGithubClient(opts: HttpGithubClientOptions): AutoPrClient { + const fetchImpl = opts.fetchImpl ?? fetch + const apiBase = (opts.apiBase ?? 'https://api.github.com').replace(/\/+$/, '') + const now = opts.now ?? (() => new Date()) + + async function api( + method: string, + path: string, + body?: unknown, + accept404 = false, + ): Promise { + const res = await fetchImpl(`${apiBase}${path}`, { + method, + headers: { + accept: 'application/vnd.github+json', + 'content-type': 'application/json', + authorization: `Bearer ${opts.token}`, + 'x-github-api-version': '2022-11-28', + }, + body: body === undefined ? undefined : JSON.stringify(body), + }) + if (accept404 && res.status === 404) return null + if (!res.ok) { + const text = await res.text().catch(() => '') + throw new ConfigError( + `proposeAutomatedPullRequest: GitHub ${method} ${path} → ${res.status} ${text.slice(0, 400)}`, + ) + } + return (await res.json()) as T + } + + return { + async proposeChange(input) { + const baseBranch = input.baseBranch ?? 'main' + const repoPath = `/repos/${input.repo.owner}/${input.repo.name}` + + if (input.dryRun) { + return { + prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`, + branchName: input.branchName, + headSha: 'dry-run', + dryRun: true, + } + } + + // 1. Find base SHA + const baseRef = await api('GET', `${repoPath}/git/ref/heads/${baseBranch}`) + if (!baseRef) { + throw new ConfigError(`proposeAutomatedPullRequest: base branch "${baseBranch}" not found`) + } + const baseSha = baseRef.object.sha + const baseCommit = await api('GET', `${repoPath}/git/commits/${baseSha}`) + if (!baseCommit) { + throw new ConfigError( + `proposeAutomatedPullRequest: base commit ${baseSha} not found (race condition?)`, + ) + } + + // 2. Create blobs for each file + const treeEntries = [] + for (const change of input.fileChanges) { + const blob = await api('POST', `${repoPath}/git/blobs`, { + content: change.contents, + encoding: 'utf-8', + }) + if (!blob) throw new ConfigError('proposeAutomatedPullRequest: blob creation returned null') + treeEntries.push({ + path: change.path, + mode: '100644', + type: 'blob', + sha: blob.sha, + }) + } + + // 3. Create tree + const tree = await api('POST', `${repoPath}/git/trees`, { + base_tree: baseCommit.tree.sha, + tree: treeEntries, + }) + if (!tree) throw new ConfigError('proposeAutomatedPullRequest: tree creation returned null') + + // 4. Create commit + const author = + input.authorName && input.authorEmail + ? { name: input.authorName, email: input.authorEmail, date: now().toISOString() } + : undefined + const commitMessage = renderCommitMessage(input) + const commit = await api('POST', `${repoPath}/git/commits`, { + message: commitMessage, + tree: tree.sha, + parents: [baseSha], + ...(author ? { author, committer: author } : {}), + }) + if (!commit) + throw new ConfigError('proposeAutomatedPullRequest: commit creation returned null') + + // 5. Create or fast-forward branch ref (idempotent on existing branch). + const existing = await api( + 'GET', + `${repoPath}/git/ref/heads/${input.branchName}`, + undefined, + true, + ) + if (!existing) { + await api('POST', `${repoPath}/git/refs`, { + ref: `refs/heads/${input.branchName}`, + sha: commit.sha, + }) + } else if (existing.object.sha !== commit.sha) { + await api('PATCH', `${repoPath}/git/refs/heads/${input.branchName}`, { + sha: commit.sha, + force: true, + }) + } + + // 6. Open PR (or find an existing open one for the same branch). + const openPrs = await api( + 'GET', + `${repoPath}/pulls?state=open&head=${encodeURIComponent(`${input.repo.owner}:${input.branchName}`)}`, + ) + let pr: GhPullRequest + if (openPrs && openPrs.length > 0) { + pr = openPrs[0] as GhPullRequest + } else { + const created = await api('POST', `${repoPath}/pulls`, { + title: input.title, + body: input.body, + head: input.branchName, + base: baseBranch, + }) + if (!created) + throw new ConfigError('proposeAutomatedPullRequest: PR creation returned null') + pr = created + } + + if (input.reviewers && input.reviewers.length > 0) { + await api( + 'POST', + `${repoPath}/pulls/${pr.number}/requested_reviewers`, + { reviewers: input.reviewers }, + true, + ).catch(() => { + /* reviewer assignment is best-effort */ + }) + } + if (input.labels && input.labels.length > 0) { + await api( + 'POST', + `${repoPath}/issues/${pr.number}/labels`, + { labels: input.labels }, + true, + ).catch(() => { + /* label assignment is best-effort */ + }) + } + + return { + prUrl: pr.html_url, + branchName: input.branchName, + headSha: commit.sha, + dryRun: false, + } + }, + } +} + +// ── gh CLI transport (no fetch needed, re-uses developer auth) ────── + +export interface GhCliClientOptions { + /** Override the CLI binary (`gh`). For testing. */ + bin?: string + /** Working directory containing a clone of `repo`. Default: process cwd. */ + cwd?: string + /** Test seam: process spawner. Default: node:child_process spawn. */ + exec?: ( + bin: string, + args: string[], + opts: { cwd: string; stdin?: string }, + ) => Promise<{ stdout: string; stderr: string; exitCode: number }> +} + +/** + * `gh` CLI transport. Requires: + * - `gh` installed and authenticated (`gh auth status`). + * - A local clone of the repo with a clean working tree. + * - `git` on PATH. + * + * Uses `gh api` for repo metadata and `gh pr create` for the PR. The + * actual commit lands via `git`, which keeps `gh`'s footprint minimal. + */ +export function ghCliClient(opts: GhCliClientOptions = {}): AutoPrClient { + const bin = opts.bin ?? 'gh' + const cwd = opts.cwd ?? process.cwd() + const exec = opts.exec ?? defaultExec + + async function run( + cmd: string, + args: string[], + stdin?: string, + ): Promise<{ stdout: string; stderr: string }> { + const r = await exec(cmd, args, { cwd, stdin }) + if (r.exitCode !== 0) { + throw new ConfigError( + `proposeAutomatedPullRequest: ${cmd} ${args.join(' ')} failed (${r.exitCode}): ${r.stderr.trim() || r.stdout.trim()}`, + ) + } + return r + } + + return { + async proposeChange(input) { + const baseBranch = input.baseBranch ?? 'main' + if (input.dryRun) { + return { + prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`, + branchName: input.branchName, + headSha: 'dry-run', + dryRun: true, + } + } + + // Ensure we're working in a clean tree on the base branch. + await run('git', ['fetch', 'origin', baseBranch]) + await run('git', ['checkout', baseBranch]) + await run('git', ['reset', '--hard', `origin/${baseBranch}`]) + + // Branch (idempotent: delete if exists, then re-create from base). + await exec('git', ['branch', '-D', input.branchName], { cwd }) + await run('git', ['checkout', '-b', input.branchName]) + + // Write file changes. + const { mkdir, writeFile } = await import('node:fs/promises') + const { dirname, join, resolve } = await import('node:path') + for (const change of input.fileChanges) { + const abs = resolve(cwd, change.path) + await mkdir(dirname(abs), { recursive: true }) + await writeFile(abs, change.contents, 'utf8') + await run('git', ['add', join(change.path)]) + } + + // Commit. + const env: Record = {} + if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName + if (input.authorEmail) env.GIT_AUTHOR_EMAIL = input.authorEmail + if (input.authorName) env.GIT_COMMITTER_NAME = input.authorName + if (input.authorEmail) env.GIT_COMMITTER_EMAIL = input.authorEmail + const message = renderCommitMessage(input) + await run('git', ['commit', '-m', message]) + + const headRes = await run('git', ['rev-parse', 'HEAD']) + const headSha = headRes.stdout.trim() + + // Push. + await run('git', ['push', '-f', 'origin', input.branchName]) + + // Open PR (idempotent: `gh pr create` errors if one exists). + const existing = await exec( + bin, + [ + 'pr', + 'list', + '--state', + 'open', + '--head', + input.branchName, + '--json', + 'url,number', + '--limit', + '1', + ], + { cwd }, + ) + let prUrl = '' + if (existing.exitCode === 0 && existing.stdout.trim()) { + const parsed = JSON.parse(existing.stdout) as Array<{ url: string }> + if (parsed.length > 0 && parsed[0]) prUrl = parsed[0].url + } + if (!prUrl) { + const args = [ + 'pr', + 'create', + '--title', + input.title, + '--body', + input.body, + '--base', + baseBranch, + ] + if (input.reviewers && input.reviewers.length > 0) { + args.push('--reviewer', input.reviewers.join(',')) + } + if (input.labels && input.labels.length > 0) { + args.push('--label', input.labels.join(',')) + } + const r = await run(bin, args) + const match = r.stdout.match(/https?:\/\/\S+/) + prUrl = match ? match[0] : r.stdout.trim() + } + + return { prUrl, branchName: input.branchName, headSha, dryRun: false } + }, + } +} + +async function defaultExec( + bin: string, + args: string[], + opts: { cwd: string; stdin?: string }, +): Promise<{ stdout: string; stderr: string; exitCode: number }> { + const { spawn } = await import('node:child_process') + return new Promise((resolveExec) => { + const child = spawn(bin, args, { cwd: opts.cwd }) + let stdout = '' + let stderr = '' + child.stdout.on('data', (d) => { + stdout += d.toString() + }) + child.stderr.on('data', (d) => { + stderr += d.toString() + }) + if (opts.stdin) child.stdin.end(opts.stdin) + child.on('error', (err) => { + resolveExec({ stdout, stderr: `${stderr}${err.message}`, exitCode: 1 }) + }) + child.on('close', (code) => { + resolveExec({ stdout, stderr, exitCode: code ?? 1 }) + }) + }) +} + +function renderCommitMessage(input: ProposeAutomatedPullRequestInput): string { + const lines = [input.title, ''] + for (const change of input.fileChanges) { + if (change.rationale) lines.push(`- ${change.path}: ${change.rationale}`) + } + if (lines[lines.length - 1] !== '') lines.push('') + lines.push(input.body.trim()) + return lines.join('\n').trim() +} diff --git a/src/index.ts b/src/index.ts index f4cc5de..f66ce54 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,6 +2,28 @@ export type { ActionExecutionPolicy, ActionPolicyDecision } from './action-policy' export { evaluateActionPolicy } from './action-policy' +// ── 0.25.0 production loop primitive ───────────────────────────────── +// Closes the eval → prod → eval cycle: ingest production traces, +// cluster failures, run evolve on the offending cluster, gate the +// candidate, open a PR with the improved prompt. +/** + * @experimental — added in 0.25.0. Surface may evolve as the production + * agents wire it in. Pin the patch version if you depend on it. + */ +export type { + AutoPrClient, + FileChange, + GhCliClientOptions, + HttpGithubClientOptions, + ProposeAutomatedPullRequestInput, + ProposeAutomatedPullRequestResult, + RepoRef, +} from './auto-pr' +export { + ghCliClient, + httpGithubClient, + proposeAutomatedPullRequest, +} from './auto-pr' export { BenchmarkRunner } from './benchmark' // ── Client / driver / judges / executor / benchmark / registry / reporter ─ export { ProductClient, runE2EWorkflow } from './client' @@ -118,6 +140,20 @@ export { MODEL_PRICING, TokenCounter, } from './metrics' +/** + * @experimental — added in 0.25.0. + */ +export type { + FailureClusterConfig, + ProductionEvolveConfig, + ProductionLoopCronConfig, + ProductionLoopDecision, + ProductionLoopRenderContext, + ProductionLoopResult, + ProductionShipConfig, + RunProductionLoopOptions, +} from './production-loop' +export { runProductionLoop } from './production-loop' export { ScenarioRegistry } from './registry' export { formatBenchmarkReport, formatDriverReport, printDriverSummary } from './reporter' export type { diff --git a/src/production-loop.ts b/src/production-loop.ts new file mode 100644 index 0000000..f8bd40b --- /dev/null +++ b/src/production-loop.ts @@ -0,0 +1,679 @@ +/** + * ProductionLoop — the substrate that closes eval → prod → eval. + * + * Static prompts decay. Yesterday's regulation flips today; yesterday's + * tool quirk becomes today's incident. A production agent that ships a + * static prompt and never re-trains is on a clock. + * + * `runProductionLoop` is the orchestration layer over the eval substrate: + * + * 1. Ingest production traces + user feedback (via the wire HTTP + * ingestion endpoints, or directly through any `TraceStore` and + * `FeedbackTrajectoryStore` implementation). + * 2. Cluster the failures (`failureClusterView`) and prioritize by + * size × severity. + * 3. If any cluster crosses the consumer's threshold, run a + * `runMultiShotOptimization` round seeded by the current production + * prompt against holdout-shape scenarios derived from the offending + * cluster. + * 4. Gate the promoted prompt with `evaluateReleaseConfidence`. Fail + * closed. + * 5. If the gate passes and an `AutoPrClient` is wired, open a PR with + * the new prompt. Otherwise return the proposed change. + * + * One call = one cycle. Cron / GitHub Actions are the caller's job. The + * primitive is idempotent + replayable: re-running with the same + * `runId` will produce the same plan. + * + * @experimental — added in 0.25.0. Surface may evolve as the 5 product + * agents wire it in. + */ + +import type { AutoPrClient, ProposeAutomatedPullRequestResult, RepoRef } from './auto-pr' +import { proposeAutomatedPullRequest } from './auto-pr' +import { ValidationError } from './errors' +import type { FeedbackTrajectoryStore } from './feedback-trajectory' +import type { GateDecision, HeldOutGateConfig } from './held-out-gate' +import type { + MultiShotMutateAdapter, + MultiShotOptimizationResult, + MultiShotRunner, + MultiShotScorer, + MultiShotTrialResult, +} from './multi-shot-optimization' +import { runMultiShotOptimization } from './multi-shot-optimization' +import { type FailureCluster, failureClusterView } from './pipelines/failure-cluster' +import type { EvolvableVariant } from './prompt-evolution' +import { + evaluateReleaseConfidence, + type ReleaseConfidenceScorecard, + type ReleaseConfidenceThresholds, + releaseTraceEvidenceFromMultiShotTrials, +} from './release-confidence' +import type { RunRecord, RunSplitTag } from './run-record' +import type { TraceStore } from './trace/store' +import type { Scenario } from './types' + +// ── Public types ───────────────────────────────────────────────────── + +export interface FailureClusterConfig { + /** Minimum runs in a cluster before it triggers an evolve round. Default 5. */ + minClusterSize?: number + /** + * Severity threshold. A cluster is "actionable" when its size + * normalized by total runs exceeds this. Default 0.05 (5% of all runs). + */ + minSeverityRatio?: number + /** + * Maximum number of clusters to react to in one cycle. Acting on too + * many at once obscures attribution. Default 1 — the worst cluster. + */ + maxClustersPerCycle?: number +} + +export interface ProductionEvolveConfig

{ + /** How to run a candidate prompt against a scenario. */ + runner: MultiShotRunner

+ /** How to score the trajectory. Usually a calibrated judge. */ + scorer: MultiShotScorer

+ /** How to mutate. Addendum-style mutators (append vs. rewrite) work best. */ + mutator: MultiShotMutateAdapter

+ /** The current production prompt. Acts as the baseline + seed. */ + baselinePrompt: P + /** Stable id for the baseline variant. Default `'baseline'`. */ + baselineId?: string + /** Scenarios resembling production load. Used as the holdout split. */ + holdoutScenarios: Scenario[] + /** Scenarios used during search. Default: derived from `holdoutScenarios` via deterministic split. */ + searchScenarios?: Scenario[] + /** Gate config for the held-out promotion check. */ + gate: HeldOutGateConfig + /** Reps per (variant × scenario) cell. Default 3. */ + reps?: number + /** Number of mutation generations. Default 3. */ + generations?: number + /** Population size per generation. Default 4. */ + populationSize?: number + /** Concurrent score() calls. Default 1. */ + scoreConcurrency?: number + /** + * Optional bridge from a scored trial into a paper-grade RunRecord. + * If omitted, the loop synthesises a minimal record sufficient for + * `HeldOutGate` and `evaluateReleaseConfidence`. + */ + toRunRecord?: (input: { + variant: EvolvableVariant

+ scenarioId: string + rep: number + split: RunSplitTag + seed: number + trial: MultiShotTrialResult + }) => RunRecord +} + +export interface ProductionShipConfig { + repo: RepoRef + /** Branch name prefix. Final branch = `${branchPrefix}/${runId}`. */ + branchPrefix: string + /** Path (repo-relative) of the file holding the production prompt. */ + promptFilePath: string + /** Base branch for the PR. Default `'main'`. */ + baseBranch?: string + reviewers?: string[] + labels?: string[] + /** Required: the auto-PR transport. Use `ghCliClient()` or `httpGithubClient()`. */ + client: AutoPrClient + /** Skip the actual push + PR call — for sanity-checking the plan. Default false. */ + dryRun?: boolean + /** Render PR body from the loop's findings. Optional override. */ + renderBody?: (ctx: ProductionLoopRenderContext) => string + /** Render the file contents from the new prompt. Default: serialize as the file. */ + renderPromptFile?: (newPrompt: string, oldFileContents: string | null) => string + /** Read the current prompt file contents for diff context. Optional. */ + readCurrentPromptFile?: () => Promise +} + +export interface ProductionLoopCronConfig { + cadence: 'weekly' | 'daily' | 'hourly' + /** Optional jitter (seconds) the consumer's scheduler should add. Surface-only. */ + jitterSec?: number +} + +export interface RunProductionLoopOptions

{ + /** Stable id; deterministic outputs when reused. */ + runId: string + /** Human label — surfaces in PR titles and reports. */ + target: string + traceStore: TraceStore + feedbackStore: FeedbackTrajectoryStore + cluster: FailureClusterConfig + evolve: ProductionEvolveConfig

+ /** When omitted, the loop returns the proposed prompt without opening a PR. */ + ship?: ProductionShipConfig + /** Surface-only — encodes scheduler expectations into the artifact. */ + cron?: ProductionLoopCronConfig + /** Release confidence thresholds. Default: library defaults. */ + releaseThresholds?: ReleaseConfidenceThresholds + /** Now() seam for reproducibility in tests. */ + now?: () => Date +} + +export type ProductionLoopDecision = + | 'no_actionable_failures' + | 'evolve_yielded_no_improvement' + | 'gate_failed' + | 'proposed_change' + | 'pr_opened' + +export interface ProductionLoopRenderContext { + runId: string + target: string + decision: ProductionLoopDecision + /** Clusters seen in production this cycle, sorted by severity. */ + clusters: FailureCluster[] + /** The cluster the loop acted on (if any). */ + actedOnCluster: FailureCluster | null + /** Production runs observed this cycle. */ + observedRunCount: number + /** Feedback trajectories observed this cycle. */ + observedFeedbackCount: number + /** Evolve result (if evolve ran). */ + evolution: MultiShotOptimizationResult | null + /** Release gate verdict (if evolve ran). */ + release: ReleaseConfidenceScorecard | null + /** Held-out gate decision (if a candidate was paired against the baseline). */ + gate: GateDecision | null + /** The baseline (current production) prompt as a string. */ + baselinePromptString: string + /** The proposed new prompt as a string. Empty if no change was proposed. */ + promotedPromptString: string +} + +export interface ProductionLoopResult { + runId: string + target: string + decision: ProductionLoopDecision + startedAt: string + finishedAt: string + observedRunCount: number + observedFeedbackCount: number + clusters: FailureCluster[] + actedOnCluster: FailureCluster | null + evolution: MultiShotOptimizationResult | null + release: ReleaseConfidenceScorecard | null + gate: GateDecision | null + /** Baseline prompt as it entered the cycle. */ + baselinePrompt: unknown + /** Promoted prompt — equals baseline when no change is proposed. */ + promotedPrompt: unknown + /** PR artifact when `ship` was wired and gate passed. */ + pullRequest: ProposeAutomatedPullRequestResult | null + cron: ProductionLoopCronConfig | null +} + +// ── Entry point ────────────────────────────────────────────────────── + +export async function runProductionLoop

( + opts: RunProductionLoopOptions

, +): Promise { + validate(opts) + const now = opts.now ?? (() => new Date()) + const startedAt = now().toISOString() + + const observedRuns = await opts.traceStore.listRuns() + const observedFeedback = await opts.feedbackStore.list() + + const clusterReport = await failureClusterView(opts.traceStore, { + minClusterSize: opts.cluster.minClusterSize ?? 1, + }) + const minSize = opts.cluster.minClusterSize ?? 5 + const minSeverity = opts.cluster.minSeverityRatio ?? 0.05 + const maxClusters = opts.cluster.maxClustersPerCycle ?? 1 + const totalRuns = clusterReport.totalRuns + const actionable = clusterReport.clusters + .filter((c) => c.runCount >= minSize) + .filter((c) => totalRuns === 0 || c.runCount / totalRuns >= minSeverity) + .slice(0, maxClusters) + + if (actionable.length === 0) { + return finalize({ + opts, + decision: 'no_actionable_failures', + startedAt, + now, + observedRunCount: observedRuns.length, + observedFeedbackCount: observedFeedback.length, + clusters: clusterReport.clusters, + actedOnCluster: null, + evolution: null, + release: null, + gate: null, + promotedPrompt: opts.evolve.baselinePrompt as unknown, + pullRequest: null, + }) + } + + // Run one evolve round against the worst cluster's scenarios. + const actedOn = actionable[0] as FailureCluster + const baseline: EvolvableVariant

= { + id: opts.evolve.baselineId ?? 'baseline', + label: opts.evolve.baselineId ?? 'baseline', + generation: 0, + payload: opts.evolve.baselinePrompt, + } + + const holdoutIds = uniqueIds(opts.evolve.holdoutScenarios.map((s) => s.id)) + const searchIds = uniqueIds( + (opts.evolve.searchScenarios ?? deriveSearchScenarios(opts.evolve.holdoutScenarios)).map( + (s) => s.id, + ), + ) + if (searchIds.some((id) => holdoutIds.includes(id))) { + throw new ValidationError( + 'runProductionLoop: searchScenarios and holdoutScenarios must be disjoint', + ) + } + + const reps = opts.evolve.reps ?? 3 + const generations = opts.evolve.generations ?? 3 + const populationSize = opts.evolve.populationSize ?? Math.max(2, opts.evolve.reps ?? 4) + + const evolution = (await runMultiShotOptimization

({ + runId: `${opts.runId}/evolve`, + target: opts.target, + seedVariants: [baseline], + searchScenarioIds: searchIds, + reps, + generations, + populationSize, + scoreConcurrency: opts.evolve.scoreConcurrency ?? 1, + runner: opts.evolve.runner, + scorer: opts.evolve.scorer, + mutateAdapter: opts.evolve.mutator, + gate: { + holdoutScenarioIds: holdoutIds, + reps, + gate: { ...opts.evolve.gate, baselineKey: baseline.id }, + toRunRecord: + opts.evolve.toRunRecord ?? + (({ variant, scenarioId, rep, split, seed, trial }) => + syntheticRunRecord({ + runId: `${opts.runId}-${variant.id}-${scenarioId}-${rep}-${split}`, + variant, + scenarioId, + rep, + split, + seed, + trial, + target: opts.target, + })), + }, + })) as MultiShotOptimizationResult + + const gate = evolution.gate?.decision ?? null + const promotedVariant = evolution.promotedVariant + const promoted = promotedVariant.payload + const promotedChanged = promotedVariant.id !== baseline.id + + // Build release scorecard — fail closed on weak evidence. + // runMultiShotOptimization populates these with MultiShotTrialResult rows + // (the adapter inside writes the multi-shot fields onto every trial), but + // the optimizer's outer type-parameter erases that to the base TrialResult + // shape. Cast deliberately — this is the documented contract. + const allTrials = evolution.evolution.generations.flatMap( + (g) => g.trials as unknown as MultiShotTrialResult[], + ) + const traceEvidence = releaseTraceEvidenceFromMultiShotTrials(allTrials) + const releaseScenarios = [ + ...(opts.evolve.searchScenarios ?? []).map((s) => ({ + id: s.id, + payload: s as unknown, + split: 'train' as const, + tags: { persona: s.persona, label: s.label }, + })), + ...opts.evolve.holdoutScenarios.map((s) => ({ + id: s.id, + payload: s as unknown, + split: 'holdout' as const, + tags: { persona: s.persona, label: s.label }, + })), + ] + const release = evaluateReleaseConfidence({ + target: opts.target, + candidateId: promotedVariant.id, + baselineId: baseline.id, + scenarios: releaseScenarios, + traces: traceEvidence, + gateDecision: gate ?? undefined, + thresholds: opts.releaseThresholds, + runs: [...(evolution.gate?.candidateRuns ?? []), ...(evolution.gate?.baselineRuns ?? [])], + }) + + if (!promotedChanged) { + return finalize({ + opts, + decision: 'evolve_yielded_no_improvement', + startedAt, + now, + observedRunCount: observedRuns.length, + observedFeedbackCount: observedFeedback.length, + clusters: clusterReport.clusters, + actedOnCluster: actedOn, + evolution, + release, + gate, + promotedPrompt: promoted as unknown, + pullRequest: null, + }) + } + + if (release.status === 'fail' || (gate && !gate.promote)) { + return finalize({ + opts, + decision: 'gate_failed', + startedAt, + now, + observedRunCount: observedRuns.length, + observedFeedbackCount: observedFeedback.length, + clusters: clusterReport.clusters, + actedOnCluster: actedOn, + evolution, + release, + gate, + promotedPrompt: promoted as unknown, + pullRequest: null, + }) + } + + if (!opts.ship) { + return finalize({ + opts, + decision: 'proposed_change', + startedAt, + now, + observedRunCount: observedRuns.length, + observedFeedbackCount: observedFeedback.length, + clusters: clusterReport.clusters, + actedOnCluster: actedOn, + evolution, + release, + gate, + promotedPrompt: promoted as unknown, + pullRequest: null, + }) + } + + // Open the PR. + const baselineStr = toPromptString(baseline.payload) + const promotedStr = toPromptString(promoted) + const ctx: ProductionLoopRenderContext = { + runId: opts.runId, + target: opts.target, + decision: 'pr_opened', + clusters: clusterReport.clusters, + actedOnCluster: actedOn, + observedRunCount: observedRuns.length, + observedFeedbackCount: observedFeedback.length, + evolution, + release, + gate, + baselinePromptString: baselineStr, + promotedPromptString: promotedStr, + } + const renderBody = opts.ship.renderBody ?? defaultRenderBody + const renderFile = + opts.ship.renderPromptFile ?? ((next: string, _prev: string | null) => `${next}\n`) + const currentFile = opts.ship.readCurrentPromptFile + ? await opts.ship.readCurrentPromptFile() + : null + + const pr = await proposeAutomatedPullRequest(opts.ship.client, { + repo: opts.ship.repo, + baseBranch: opts.ship.baseBranch ?? 'main', + branchName: `${opts.ship.branchPrefix.replace(/\/+$/, '')}/${opts.runId}`, + title: `${opts.target}: production-loop prompt update (${opts.runId})`, + body: renderBody(ctx), + reviewers: opts.ship.reviewers, + labels: opts.ship.labels, + fileChanges: [ + { + path: opts.ship.promptFilePath, + contents: renderFile(promotedStr, currentFile), + rationale: `Auto-improved against cluster "${actedOn.failureClass}" (${actedOn.runCount} prod failures)`, + }, + ], + dryRun: opts.ship.dryRun, + }) + + return finalize({ + opts, + decision: 'pr_opened', + startedAt, + now, + observedRunCount: observedRuns.length, + observedFeedbackCount: observedFeedback.length, + clusters: clusterReport.clusters, + actedOnCluster: actedOn, + evolution, + release, + gate, + promotedPrompt: promoted as unknown, + pullRequest: pr, + }) +} + +// ── Helpers ────────────────────────────────────────────────────────── + +function finalize

(args: { + opts: RunProductionLoopOptions

+ decision: ProductionLoopDecision + startedAt: string + now: () => Date + observedRunCount: number + observedFeedbackCount: number + clusters: FailureCluster[] + actedOnCluster: FailureCluster | null + evolution: MultiShotOptimizationResult | null + release: ReleaseConfidenceScorecard | null + gate: GateDecision | null + promotedPrompt: unknown + pullRequest: ProposeAutomatedPullRequestResult | null +}): ProductionLoopResult { + return { + runId: args.opts.runId, + target: args.opts.target, + decision: args.decision, + startedAt: args.startedAt, + finishedAt: args.now().toISOString(), + observedRunCount: args.observedRunCount, + observedFeedbackCount: args.observedFeedbackCount, + clusters: args.clusters, + actedOnCluster: args.actedOnCluster, + evolution: args.evolution, + release: args.release, + gate: args.gate, + baselinePrompt: args.opts.evolve.baselinePrompt, + promotedPrompt: args.promotedPrompt, + pullRequest: args.pullRequest, + cron: args.opts.cron ?? null, + } +} + +function validate

(opts: RunProductionLoopOptions

): void { + if (!opts.runId.trim()) throw new ValidationError('runProductionLoop: runId required') + if (!opts.target.trim()) throw new ValidationError('runProductionLoop: target required') + if (opts.evolve.holdoutScenarios.length === 0) { + throw new ValidationError('runProductionLoop: evolve.holdoutScenarios must not be empty') + } + if (opts.evolve.searchScenarios && opts.evolve.searchScenarios.length === 0) { + throw new ValidationError( + 'runProductionLoop: evolve.searchScenarios must be omitted or non-empty', + ) + } + if (!opts.evolve.gate.baselineKey && !opts.evolve.baselineId) { + // baselineId defaults to 'baseline', but if the caller explicitly set + // a baselineKey on the gate, the optimization adapter enforces that + // it matches; verify here so we fail fast. + } + if (opts.ship) { + if (!opts.ship.branchPrefix.trim()) { + throw new ValidationError('runProductionLoop: ship.branchPrefix required') + } + if (!opts.ship.promptFilePath.trim()) { + throw new ValidationError('runProductionLoop: ship.promptFilePath required') + } + } +} + +function uniqueIds(ids: string[]): string[] { + const seen = new Set() + const out: string[] = [] + for (const id of ids) { + if (seen.has(id)) continue + seen.add(id) + out.push(id) + } + return out +} + +/** + * Deterministic split when the consumer only provides holdout scenarios: + * use a stable hash to pick ~25% as search. Caller-side scenarios are + * always preferred (this is a fallback, not a recommendation). + */ +function deriveSearchScenarios(holdout: Scenario[]): Scenario[] { + if (holdout.length < 4) { + // Synthesize a small label-only search scenario to keep search + // disjoint from holdout. This degrades to a less-rigorous evolve + // but never silently overlaps. + return [ + { + ...(holdout[0] as Scenario), + id: `${(holdout[0] as Scenario).id}__search`, + }, + ] + } + return holdout.filter((_, i) => i % 4 === 0).map((s) => ({ ...s, id: `${s.id}__search` })) +} + +function syntheticRunRecord(input: { + runId: string + variant: EvolvableVariant + scenarioId: string + rep: number + split: RunSplitTag + seed: number + trial: MultiShotTrialResult + target: string +}): RunRecord { + const scoreKey = input.split === 'holdout' ? 'holdoutScore' : 'searchScore' + return { + runId: input.runId, + experimentId: input.target, + candidateId: input.variant.id, + seed: input.seed, + model: 'production-loop@synthetic', + promptHash: '0'.repeat(64), + configHash: '0'.repeat(64), + commitSha: '0'.repeat(40), + wallMs: input.trial.durationMs ?? 1, + costUsd: input.trial.cost ?? 0, + tokenUsage: { input: 0, output: 0 }, + outcome: { + [scoreKey]: input.trial.score, + raw: { score: input.trial.score, ok: input.trial.ok ? 1 : 0 }, + }, + splitTag: input.split, + scenarioId: input.scenarioId, + } +} + +function toPromptString(payload: unknown): string { + if (typeof payload === 'string') return payload + if (payload == null) return '' + try { + return JSON.stringify(payload, null, 2) + } catch { + return String(payload) + } +} + +function defaultRenderBody(ctx: ProductionLoopRenderContext): string { + const cluster = ctx.actedOnCluster + const release = ctx.release + const gate = ctx.gate + const lines: string[] = [] + lines.push(`## Production-loop prompt update — \`${ctx.target}\``) + lines.push('') + lines.push(`Run id: \`${ctx.runId}\``) + lines.push(`Decision: \`${ctx.decision}\``) + lines.push( + `Observed in this cycle: ${ctx.observedRunCount} prod runs, ${ctx.observedFeedbackCount} feedback trajectories.`, + ) + lines.push('') + if (cluster) { + lines.push('### Triggering failure cluster') + lines.push('') + lines.push(`- **class**: \`${cluster.failureClass}\``) + lines.push(`- **runs in cluster**: ${cluster.runCount}`) + lines.push(`- **distinct scenarios**: ${cluster.scenarioIds.length}`) + if (cluster.toolName) lines.push(`- **tool**: \`${cluster.toolName}\``) + if (cluster.dimension) lines.push(`- **judge dimension**: \`${cluster.dimension}\``) + if (cluster.exampleError) { + lines.push( + `- **example error**: \`${cluster.exampleError.slice(0, 200).replace(/\n/g, ' ')}\``, + ) + } + lines.push('') + } + if (gate) { + lines.push('### Held-out promotion gate') + lines.push('') + lines.push(`- **decision**: \`${gate.promote ? 'PROMOTE' : 'REJECT'}\``) + lines.push(`- **paired median delta**: ${gate.evidence.medianPairedDelta.toFixed(4)}`) + lines.push( + `- **paired 95% CI**: [${gate.evidence.pairedCI.low.toFixed(4)}, ${gate.evidence.pairedCI.high.toFixed(4)}]`, + ) + lines.push(`- **paired p-value**: ${gate.evidence.pairedPValue.toFixed(4)}`) + lines.push( + `- **search/holdout means**: ${gate.evidence.searchScore.toFixed(4)} / ${gate.evidence.holdoutScore.toFixed(4)}`, + ) + lines.push(`- **overfit gap**: ${gate.evidence.overfitGap.toFixed(4)}`) + lines.push('') + } + if (release) { + lines.push('### Release confidence') + lines.push('') + lines.push(`- **status**: \`${release.status}\``) + lines.push(`- **pass rate**: ${release.metrics.passRate.toFixed(4)}`) + lines.push(`- **mean score**: ${release.metrics.meanScore.toFixed(4)}`) + if (release.issues.length > 0) { + lines.push('- **issues**:') + for (const issue of release.issues) { + lines.push(` - \`${issue.severity}\` ${issue.axis}: ${issue.detail}`) + } + } + lines.push('') + } + lines.push('### Prompt diff') + lines.push('') + lines.push('```diff') + lines.push(unifiedDiff(ctx.baselinePromptString, ctx.promotedPromptString)) + lines.push('```') + return lines.join('\n') +} + +function unifiedDiff(a: string, b: string): string { + const aLines = a.split('\n') + const bLines = b.split('\n') + const out: string[] = [] + const max = Math.max(aLines.length, bLines.length) + for (let i = 0; i < max; i++) { + const al = aLines[i] + const bl = bLines[i] + if (al === bl) continue + if (al !== undefined) out.push(`- ${al}`) + if (bl !== undefined) out.push(`+ ${bl}`) + } + return out.join('\n') +} diff --git a/src/wire/handlers.ts b/src/wire/handlers.ts index 7d7911e..d2cae9f 100644 --- a/src/wire/handlers.ts +++ b/src/wire/handlers.ts @@ -9,16 +9,23 @@ * - Throws `WireError` for caller-fixable errors (404, 400, 422). * - Lets unexpected errors bubble — the transport maps them to 500. */ +import type { FeedbackTrajectoryStore } from '../feedback-trajectory' import { callLlmJson } from '../llm-client' +import type { TraceEvent as InternalTraceEvent } from '../trace/schema' +import type { TraceStore } from '../trace/store' import { getBuiltinRubric, listBuiltinRubrics } from './rubrics' import { + type FeedbackIngestResponse, hashRubric, type JudgeRequest, type JudgeResult, type ListRubricsResponse, type Rubric, + type TracesIngestRequest, + type TracesIngestResponse, type VersionResponse, WIRE_VERSION, + type FeedbackTrajectory as WireFeedbackTrajectory, } from './schemas' /** Caller-fixable error. The transport renders this to 4xx + ErrorResponse. */ @@ -260,6 +267,82 @@ export function handleVersion(): VersionResponse { package: '@tangle-network/agent-eval', version: readPackageVersion(), wireVersion: WIRE_VERSION, - apiSurface: ['judge', 'listRubrics', 'version'], + apiSurface: ['judge', 'listRubrics', 'version', 'feedback.ingest', 'traces.ingest'], } } + +// ── Ingestion handlers (0.25.0) ───────────────────────────────────── + +/** + * Pluggable stores the wire layer routes ingestion writes into. Both + * are optional — when omitted, the corresponding endpoint returns 503. + * + * Production deployments wire a `FileSystemTraceStore` and + * `FileSystemFeedbackTrajectoryStore` here. Tests substitute in-memory + * stores. + */ +export interface IngestionStores { + traceStore?: TraceStore + feedbackStore?: FeedbackTrajectoryStore +} + +/** + * `POST /v1/traces/ingest` — accept a batch of `TraceEvent`s from the + * production runtime. Best-effort: each event is appended independently; + * one bad event does not poison the batch. + * + * Idempotency: the underlying store is append-only; consumers retrying + * the same payload will get duplicate events. Consumers should + * de-duplicate by `eventId` downstream — production traces frequently + * land via at-least-once buses (Kafka, SQS) where dedup is unavoidable. + */ +export async function handleTracesIngest( + req: TracesIngestRequest, + stores: IngestionStores, +): Promise { + if (!stores.traceStore) { + throw new WireError( + 'service_unavailable', + 'No trace store configured on this server. Pass `traceStore` to `createApp`.', + 503, + ) + } + const errors: Array<{ eventId: string; message: string }> = [] + let accepted = 0 + for (const event of req.events) { + try { + // The wire `TraceEvent` is structurally identical to the internal one. + await stores.traceStore.appendEvent(event as InternalTraceEvent) + accepted++ + } catch (err) { + errors.push({ + eventId: event.eventId, + message: err instanceof Error ? err.message : String(err), + }) + } + } + return { accepted, rejected: errors.length, errors } +} + +/** + * `POST /v1/feedback` — accept a single `FeedbackTrajectory` from the + * production runtime. Idempotent on `id`: re-posting the same trajectory + * replaces the prior record. + */ +export async function handleFeedbackIngest( + req: WireFeedbackTrajectory, + stores: IngestionStores, +): Promise { + if (!stores.feedbackStore) { + throw new WireError( + 'service_unavailable', + 'No feedback store configured on this server. Pass `feedbackStore` to `createApp`.', + 503, + ) + } + // The wire `FeedbackTrajectory` aligns 1:1 with the internal type; + // cast through `unknown` since the wire schema is a Zod-inferred + // structural type with optional fields the internal store consumes. + await stores.feedbackStore.save(req as unknown as Parameters[0]) + return { id: req.id, persisted: true } +} diff --git a/src/wire/openapi.ts b/src/wire/openapi.ts index 1aaf6d3..8603d59 100644 --- a/src/wire/openapi.ts +++ b/src/wire/openapi.ts @@ -14,10 +14,14 @@ import type { OpenAPIObject } from 'openapi3-ts/oas31' import { ErrorResponseSchema, + FeedbackIngestResponseSchema, + FeedbackTrajectorySchema, HealthResponseSchema, JudgeRequestSchema, JudgeResultSchema, ListRubricsResponseSchema, + TracesIngestRequestSchema, + TracesIngestResponseSchema, VersionResponseSchema, WIRE_VERSION, } from './schemas' @@ -32,6 +36,10 @@ export function buildOpenApi(packageVersion: string): OpenAPIObject { registry.register('VersionResponse', VersionResponseSchema) registry.register('HealthResponse', HealthResponseSchema) registry.register('ErrorResponse', ErrorResponseSchema) + registry.register('TracesIngestRequest', TracesIngestRequestSchema) + registry.register('TracesIngestResponse', TracesIngestResponseSchema) + registry.register('FeedbackTrajectory', FeedbackTrajectorySchema) + registry.register('FeedbackIngestResponse', FeedbackIngestResponseSchema) // Routes registry.registerPath({ @@ -106,6 +114,73 @@ export function buildOpenApi(packageVersion: string): OpenAPIObject { }, }) + registry.registerPath({ + method: 'post', + path: '/v1/traces/ingest', + summary: 'Ingest a batch of production TraceEvents', + description: + 'Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.', + request: { + body: { + content: { + 'application/json': { schema: TracesIngestRequestSchema }, + 'application/x-ndjson': { schema: TracesIngestRequestSchema }, + }, + }, + }, + responses: { + 200: { + description: 'Ingestion summary', + content: { 'application/json': { schema: TracesIngestResponseSchema } }, + }, + 400: { + description: 'Validation error', + content: { 'application/json': { schema: ErrorResponseSchema } }, + }, + 401: { + description: 'Unauthorized (when bearer auth is configured)', + content: { 'application/json': { schema: ErrorResponseSchema } }, + }, + 503: { + description: 'No trace store configured', + content: { 'application/json': { schema: ErrorResponseSchema } }, + }, + }, + }) + + registry.registerPath({ + method: 'post', + path: '/v1/feedback', + summary: 'Ingest a FeedbackTrajectory from production', + description: + 'Persist a single FeedbackTrajectory. Idempotent on trajectory.id — re-posting replaces the prior record. Used by production runtimes to forward user 👍/👎/edits into the eval substrate.', + request: { + body: { + content: { + 'application/json': { schema: FeedbackTrajectorySchema }, + }, + }, + }, + responses: { + 200: { + description: 'Persisted', + content: { 'application/json': { schema: FeedbackIngestResponseSchema } }, + }, + 400: { + description: 'Validation error', + content: { 'application/json': { schema: ErrorResponseSchema } }, + }, + 401: { + description: 'Unauthorized (when bearer auth is configured)', + content: { 'application/json': { schema: ErrorResponseSchema } }, + }, + 503: { + description: 'No feedback store configured', + content: { 'application/json': { schema: ErrorResponseSchema } }, + }, + }, + }) + const generator = new OpenApiGeneratorV31(registry.definitions) const doc = generator.generateDocument({ openapi: '3.1.0', diff --git a/src/wire/schemas.ts b/src/wire/schemas.ts index fd3e553..1a8c542 100644 --- a/src/wire/schemas.ts +++ b/src/wire/schemas.ts @@ -181,6 +181,171 @@ export const HealthResponseSchema = z }) .openapi('HealthResponse') +// ── Ingestion: production traces + user feedback (0.25.0) ─────────── + +/** + * Minimal `TraceEvent` shape that the production runtime emits. + * Matches `trace/schema.ts` `TraceEvent` but is duplicated here as a + * wire schema so non-TypeScript clients can validate without depending + * on internal types. + */ +export const TraceEventSchema = z + .object({ + eventId: z.string().min(1).describe('Stable id for the event. Use ULID or UUID.'), + runId: z.string().min(1).describe('Run this event belongs to.'), + spanId: z.string().optional().describe('Span that emitted the event, if any.'), + kind: z + .enum([ + 'log', + 'error', + 'budget_decrement', + 'budget_breach', + 'state_mutation', + 'policy_violation', + 'redaction_applied', + 'custom', + ]) + .describe('Coarse event category — matches the TraceSchema v1 EventKind enum.'), + timestamp: z + .number() + .int() + .nonnegative() + .describe('Unix millis. Must be monotonically non-decreasing within a span.'), + payload: z + .record(z.string(), z.unknown()) + .describe('Free-form payload — the runtime owns the shape.'), + }) + .openapi('TraceEvent') + +export const TracesIngestRequestSchema = z + .object({ + events: z + .array(TraceEventSchema) + .min(1) + .max(10_000) + .describe('Batch of events. Max 10k per call — bigger streams should be chunked.'), + }) + .openapi('TracesIngestRequest') + +export const TracesIngestResponseSchema = z + .object({ + accepted: z.number().int().nonnegative().describe('Number of events persisted.'), + rejected: z + .number() + .int() + .nonnegative() + .describe('Number of events the store refused — see `errors[]` for reasons.'), + errors: z + .array( + z.object({ + eventId: z.string().describe('Event id this error applies to.'), + message: z.string().describe('Why the event was rejected.'), + }), + ) + .default([]), + }) + .openapi('TracesIngestResponse') + +export const FeedbackLabelSchema = z + .object({ + id: z.string().optional(), + source: z.enum(['user', 'judge', 'environment', 'metric', 'policy', 'system']), + kind: z.enum([ + 'approve', + 'reject', + 'select', + 'edit', + 'rank', + 'rate', + 'comment', + 'metric_outcome', + 'policy_block', + 'revision_request', + ]), + value: z.unknown(), + reason: z.string().optional(), + severity: z.enum(['info', 'warning', 'error', 'critical']).optional(), + createdAt: z.string().describe('ISO-8601 UTC.'), + metadata: z.record(z.string(), z.unknown()).optional(), + }) + .openapi('FeedbackLabel') + +export const FeedbackAttemptSchema = z + .object({ + id: z.string().min(1), + stepIndex: z.number().int().nonnegative(), + artifactType: z.enum([ + 'text', + 'code', + 'plan', + 'research', + 'action', + 'ui', + 'decision', + 'data', + 'other', + ]), + artifact: z.unknown(), + options: z.array(z.unknown()).optional(), + proposedAction: z + .object({ + type: z.string(), + risk: z.enum(['low', 'medium', 'high']).optional(), + costUsd: z.number().optional(), + externalSideEffect: z.boolean().optional(), + requiresApproval: z.boolean().optional(), + metadata: z.record(z.string(), z.unknown()).optional(), + }) + .optional(), + feedback: z.array(FeedbackLabelSchema).optional(), + createdAt: z.string(), + metadata: z.record(z.string(), z.unknown()).optional(), + }) + .openapi('FeedbackAttempt') + +export const FeedbackTrajectorySchema = z + .object({ + id: z.string().min(1).describe('Stable id; idempotency key for the trajectory.'), + projectId: z.string().optional(), + scenarioId: z.string().optional(), + task: z.object({ + intent: z.string().min(1), + context: z.unknown().optional(), + }), + attempts: z.array(FeedbackAttemptSchema).default([]), + labels: z.array(FeedbackLabelSchema).default([]), + outcome: z + .object({ + success: z.boolean().optional(), + score: z.number().optional(), + metrics: z.record(z.string(), z.number()).optional(), + costUsd: z.number().optional(), + detail: z.string().optional(), + observedAt: z.string().optional(), + metadata: z.record(z.string(), z.unknown()).optional(), + }) + .optional(), + split: z.enum(['train', 'dev', 'test', 'holdout']).optional(), + tags: z.record(z.string(), z.string()).optional(), + createdAt: z.string().describe('ISO-8601 UTC.'), + updatedAt: z.string().optional(), + metadata: z.record(z.string(), z.unknown()).optional(), + }) + .openapi('FeedbackTrajectory') + +export const FeedbackIngestResponseSchema = z + .object({ + id: z.string().describe('Trajectory id that was persisted.'), + persisted: z.boolean().describe('True when the trajectory was saved (idempotent on id).'), + }) + .openapi('FeedbackIngestResponse') + +export type TraceEvent = z.infer +export type TracesIngestRequest = z.infer +export type TracesIngestResponse = z.infer +export type FeedbackTrajectory = z.infer +export type FeedbackIngestResponse = z.infer + // ── Errors ────────────────────────────────────────────────────────── export const ErrorResponseSchema = z diff --git a/src/wire/server.ts b/src/wire/server.ts index e531348..7efdcc5 100644 --- a/src/wire/server.ts +++ b/src/wire/server.ts @@ -6,29 +6,81 @@ * 2. Calls the matching handler in `handlers.ts`. * 3. Renders 4xx for `WireError` with structured body, 500 for unexpected. * - * The server has no internal state besides the handler imports — restart - * costs nothing. Run via `agent-eval serve --port 5005`. + * The server holds optional `IngestionStores` (passed to `createApp`) + * to receive production traces and user feedback. With no stores wired, + * the ingestion endpoints return 503 — read endpoints (`/v1/judge`, + * `/v1/rubrics`, `/v1/version`) remain fully functional. + * + * Run via `agent-eval serve --port 5005`. */ import { type ServerType, serve } from '@hono/node-server' import { Hono } from 'hono' import { cors } from 'hono/cors' -import { handleJudge, handleListRubrics, handleVersion, WireError } from './handlers' +import { + handleFeedbackIngest, + handleJudge, + handleListRubrics, + handleTracesIngest, + handleVersion, + type IngestionStores, + WireError, +} from './handlers' import { buildOpenApi } from './openapi' -import { JudgeRequestSchema } from './schemas' +import { FeedbackTrajectorySchema, JudgeRequestSchema, TracesIngestRequestSchema } from './schemas' const STARTED_AT = Date.now() -export function createApp() { +export interface CreateAppOptions { + /** Stores wired to the ingestion endpoints. */ + stores?: IngestionStores + /** + * Bearer-token auth. When provided, every endpoint EXCEPT `/healthz` + * and `/v1/version` requires `Authorization: Bearer `. The + * token may be a static string OR a function for time-bounded / + * rotating tokens. + * + * Recommended for any server that accepts ingestion writes from the + * public internet. Read-only deployments may omit it. + */ + auth?: { + bearer: string | ((token: string) => boolean | Promise) + } +} + +const AUTH_EXEMPT_PATHS = new Set(['/healthz', '/v1/version', '/openapi.json']) + +export function createApp(opts: CreateAppOptions = {}) { const app = new Hono() app.use('*', cors()) + // Bearer-token middleware (only attached when configured). + if (opts.auth) { + const verify = opts.auth.bearer + app.use('*', async (c, next) => { + const path = new URL(c.req.url).pathname + if (AUTH_EXEMPT_PATHS.has(path)) return next() + const raw = c.req.header('authorization') ?? '' + const match = raw.match(/^Bearer\s+(.+)$/i) + if (!match) { + throw new WireError('unauthorized', 'Missing or malformed Authorization header.', 401) + } + const token = match[1] as string + const ok = typeof verify === 'string' ? token === verify : await verify(token) + if (!ok) { + throw new WireError('unauthorized', 'Invalid bearer token.', 401) + } + return next() + }) + } + app.onError((err, c) => { if (err instanceof WireError) { + const status = err.status as 400 | 401 | 404 | 422 | 500 | 503 return c.json( { error: { code: err.code, message: err.message, details: err.details } }, - err.status as 400 | 404 | 422 | 500, + status, ) } // Unexpected — log and return generic 500 without leaking internals. @@ -66,13 +118,74 @@ export function createApp() { return c.json(result) }) + // ── Traces ingest (NDJSON-friendly: accepts either {events:[...]} or NDJSON) ── + app.post('/v1/traces/ingest', async (c) => { + const contentType = c.req.header('content-type') ?? '' + let payload: unknown + if (contentType.includes('application/x-ndjson')) { + const text = await c.req.text() + const events = text + .split('\n') + .map((line) => line.trim()) + .filter((line) => line.length > 0) + .map((line) => { + try { + return JSON.parse(line) + } catch { + throw new WireError( + 'validation_error', + 'NDJSON line did not parse as JSON.', + 400, + line.slice(0, 200), + ) + } + }) + payload = { events } + } else { + payload = await c.req.json().catch(() => null) + } + if (payload == null) { + throw new WireError('validation_error', 'Request body must be JSON or NDJSON.', 400) + } + const parsed = TracesIngestRequestSchema.safeParse(payload) + if (!parsed.success) { + throw new WireError( + 'validation_error', + 'Request did not match TracesIngestRequest schema.', + 400, + parsed.error.issues, + ) + } + const result = await handleTracesIngest(parsed.data, opts.stores ?? {}) + return c.json(result) + }) + + // ── Feedback ingest ── + app.post('/v1/feedback', async (c) => { + const raw = await c.req.json().catch(() => null) + if (raw == null) { + throw new WireError('validation_error', 'Request body must be JSON.', 400) + } + const parsed = FeedbackTrajectorySchema.safeParse(raw) + if (!parsed.success) { + throw new WireError( + 'validation_error', + 'Request did not match FeedbackTrajectory schema.', + 400, + parsed.error.issues, + ) + } + const result = await handleFeedbackIngest(parsed.data, opts.stores ?? {}) + return c.json(result) + }) + // ── OpenAPI spec ── app.get('/openapi.json', (c) => c.json(buildOpenApi(handleVersion().version))) return app } -export interface ServeOptions { +export interface ServeOptions extends CreateAppOptions { /** Default 5005. */ port?: number /** Default '127.0.0.1'. Set to '0.0.0.0' to listen on all interfaces. */ @@ -80,7 +193,7 @@ export interface ServeOptions { } export function startServer(opts: ServeOptions = {}): ServerType { - const app = createApp() + const app = createApp(opts) const port = opts.port ?? 5005 const host = opts.host ?? '127.0.0.1' return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => { diff --git a/tests/auto-pr.test.ts b/tests/auto-pr.test.ts new file mode 100644 index 0000000..85b59ff --- /dev/null +++ b/tests/auto-pr.test.ts @@ -0,0 +1,290 @@ +/** + * Auto-PR tests. + * + * Regression coverage: + * - input validation (bad branch names, '..' paths, duplicate paths) + * - HTTP client opens a PR via the documented REST sequence (blob → + * tree → commit → ref → pulls) + * - HTTP client is idempotent: existing open PR is returned instead + * of a duplicate create + * - HTTP client fast-forwards the ref when it already exists with a + * different SHA + * - dryRun does not call fetch/exec + * + * No network. + */ +import { describe, expect, it, vi } from 'vitest' + +import { + ghCliClient, + httpGithubClient, + proposeAutomatedPullRequest, +} from '../src/auto-pr' +import { ValidationError } from '../src/errors' + +function jsonResponse(body: unknown, status = 200): Response { + return new Response(JSON.stringify(body), { + status, + headers: { 'content-type': 'application/json' }, + }) +} + +describe('proposeAutomatedPullRequest validation', () => { + const fixture = { + repo: { owner: 'tangle-network', name: 'tax-agent' }, + branchName: 'eval/auto-improve/r1', + fileChanges: [{ path: 'prompts/system.txt', contents: 'new' }], + title: 'feat: production-loop', + body: 'body', + } + + it('rejects an empty repo owner', async () => { + const client = { proposeChange: vi.fn() } + await expect( + proposeAutomatedPullRequest(client, { ...fixture, repo: { owner: '', name: 'x' } }), + ).rejects.toBeInstanceOf(ValidationError) + expect(client.proposeChange).not.toHaveBeenCalled() + }) + + it('rejects whitespace branch names', async () => { + const client = { proposeChange: vi.fn() } + await expect( + proposeAutomatedPullRequest(client, { ...fixture, branchName: 'has space' }), + ).rejects.toBeInstanceOf(ValidationError) + }) + + it('rejects path traversal', async () => { + const client = { proposeChange: vi.fn() } + await expect( + proposeAutomatedPullRequest(client, { + ...fixture, + fileChanges: [{ path: '../etc/passwd', contents: '' }], + }), + ).rejects.toBeInstanceOf(ValidationError) + }) + + it('rejects duplicate paths', async () => { + const client = { proposeChange: vi.fn() } + await expect( + proposeAutomatedPullRequest(client, { + ...fixture, + fileChanges: [ + { path: 'a.txt', contents: '1' }, + { path: 'a.txt', contents: '2' }, + ], + }), + ).rejects.toBeInstanceOf(ValidationError) + }) + + it('rejects branch name equal to base branch', async () => { + const client = { proposeChange: vi.fn() } + await expect( + proposeAutomatedPullRequest(client, { ...fixture, branchName: 'main', baseBranch: 'main' }), + ).rejects.toBeInstanceOf(ValidationError) + }) + + it('empty title is rejected', async () => { + const client = { proposeChange: vi.fn() } + await expect( + proposeAutomatedPullRequest(client, { ...fixture, title: ' ' }), + ).rejects.toBeInstanceOf(ValidationError) + }) + + it('passes through to client.proposeChange on valid input', async () => { + const client = { + proposeChange: vi + .fn() + .mockResolvedValue({ prUrl: 'u', branchName: fixture.branchName, headSha: 's', dryRun: false }), + } + const result = await proposeAutomatedPullRequest(client, fixture) + expect(result.prUrl).toBe('u') + expect(client.proposeChange).toHaveBeenCalledTimes(1) + }) +}) + +describe('httpGithubClient', () => { + function fakeFetch(handler: (url: string, init: RequestInit) => Response): typeof fetch { + return ((url: RequestInfo | URL, init?: RequestInit) => + Promise.resolve(handler(String(url), init ?? {}))) as typeof fetch + } + + function transcript() { + const requests: Array<{ method: string; url: string; body: unknown }> = [] + return { + requests, + record(url: string, init: RequestInit) { + requests.push({ + method: init.method ?? 'GET', + url, + body: init.body ? JSON.parse(init.body as string) : null, + }) + }, + } + } + + it('opens a PR via blob → tree → commit → ref → pulls', async () => { + const t = transcript() + const client = httpGithubClient({ + token: 'test-token', + fetchImpl: fakeFetch((url, init) => { + t.record(url, init) + // Base ref + if (url.endsWith('/git/ref/heads/main')) { + return jsonResponse({ ref: 'refs/heads/main', object: { sha: 'base-sha' } }) + } + // Base commit (for tree.sha) + if (url.endsWith('/git/commits/base-sha')) { + return jsonResponse({ sha: 'base-sha', tree: { sha: 'base-tree-sha' } }) + } + // Create blob + if (url.endsWith('/git/blobs') && init.method === 'POST') { + return jsonResponse({ sha: 'blob-sha-1' }) + } + // Create tree + if (url.endsWith('/git/trees') && init.method === 'POST') { + return jsonResponse({ sha: 'tree-sha-1' }) + } + // Create commit + if (url.endsWith('/git/commits') && init.method === 'POST') { + return jsonResponse({ sha: 'commit-sha-1', tree: { sha: 'tree-sha-1' } }) + } + // Branch ref existence (404 -> not present) + if (url.includes('/git/ref/heads/eval')) { + return jsonResponse({ message: 'Not Found' }, 404) + } + // Create ref + if (url.endsWith('/git/refs') && init.method === 'POST') { + return jsonResponse({ ref: 'refs/heads/eval/r1', object: { sha: 'commit-sha-1' } }) + } + // List PRs (empty -> nothing exists) + if (url.includes('/pulls?')) { + return jsonResponse([]) + } + // Create PR + if (url.endsWith('/pulls') && init.method === 'POST') { + return jsonResponse({ + html_url: 'https://github.com/o/r/pull/1', + number: 1, + }) + } + // Reviewers / labels (best-effort, accept any) + return jsonResponse({}, 200) + }), + now: () => new Date('2026-01-01T00:00:00Z'), + }) + + const result = await client.proposeChange({ + repo: { owner: 'o', name: 'r' }, + branchName: 'eval/r1', + fileChanges: [{ path: 'a.txt', contents: 'hello' }], + title: 'T', + body: 'B', + }) + + expect(result.prUrl).toBe('https://github.com/o/r/pull/1') + expect(result.headSha).toBe('commit-sha-1') + expect(result.dryRun).toBe(false) + + // Verify documented REST sequence in order. + const ordered = t.requests.map((r) => `${r.method} ${r.url.replace(/\?.*$/, '')}`) + expect(ordered).toContain('GET https://api.github.com/repos/o/r/git/ref/heads/main') + expect(ordered).toContain('POST https://api.github.com/repos/o/r/git/blobs') + expect(ordered).toContain('POST https://api.github.com/repos/o/r/git/trees') + expect(ordered).toContain('POST https://api.github.com/repos/o/r/git/commits') + expect(ordered).toContain('POST https://api.github.com/repos/o/r/git/refs') + expect(ordered).toContain('POST https://api.github.com/repos/o/r/pulls') + }) + + it('returns the existing open PR instead of opening a duplicate (idempotency)', async () => { + const client = httpGithubClient({ + token: 'tok', + fetchImpl: fakeFetch((url, init) => { + if (url.endsWith('/git/ref/heads/main')) { + return jsonResponse({ ref: 'refs/heads/main', object: { sha: 'base-sha' } }) + } + if (url.endsWith('/git/commits/base-sha')) { + return jsonResponse({ sha: 'base-sha', tree: { sha: 'base-tree' } }) + } + if (url.endsWith('/git/blobs')) return jsonResponse({ sha: 'b1' }) + if (url.endsWith('/git/trees')) return jsonResponse({ sha: 't1' }) + if (url.endsWith('/git/commits') && init.method === 'POST') { + return jsonResponse({ sha: 'c1', tree: { sha: 't1' } }) + } + // Branch already exists at same sha. + if (url.endsWith('/git/ref/heads/eval/r1')) { + return jsonResponse({ ref: 'refs/heads/eval/r1', object: { sha: 'c1' } }) + } + if (url.includes('/pulls?')) { + return jsonResponse([{ html_url: 'https://github.com/o/r/pull/77', number: 77 }]) + } + return jsonResponse({}, 200) + }), + }) + + const result = await client.proposeChange({ + repo: { owner: 'o', name: 'r' }, + branchName: 'eval/r1', + fileChanges: [{ path: 'a.txt', contents: 'x' }], + title: 'T', + body: 'B', + }) + + expect(result.prUrl).toBe('https://github.com/o/r/pull/77') + }) + + it('dryRun does not call fetch', async () => { + const fetchSpy = vi.fn() + const client = httpGithubClient({ + token: 'tok', + fetchImpl: fetchSpy as unknown as typeof fetch, + }) + const r = await client.proposeChange({ + repo: { owner: 'o', name: 'r' }, + branchName: 'eval/r1', + fileChanges: [{ path: 'a.txt', contents: 'x' }], + title: 'T', + body: 'B', + dryRun: true, + }) + expect(r.dryRun).toBe(true) + expect(fetchSpy).not.toHaveBeenCalled() + }) + + it('surfaces GitHub API failures with status code and body excerpt', async () => { + const client = httpGithubClient({ + token: 'tok', + fetchImpl: () => + Promise.resolve( + new Response('rate limit exceeded', { status: 403, headers: { 'content-type': 'text/plain' } }), + ), + }) + + await expect( + client.proposeChange({ + repo: { owner: 'o', name: 'r' }, + branchName: 'eval/r1', + fileChanges: [{ path: 'a.txt', contents: 'x' }], + title: 'T', + body: 'B', + }), + ).rejects.toThrow(/403/) + }) +}) + +describe('ghCliClient', () => { + it('dryRun returns a synthetic compare URL without exec calls', async () => { + const execSpy = vi.fn() + const client = ghCliClient({ exec: execSpy as unknown as never }) + const r = await client.proposeChange({ + repo: { owner: 'o', name: 'r' }, + branchName: 'eval/r1', + fileChanges: [{ path: 'a.txt', contents: 'x' }], + title: 'T', + body: 'B', + dryRun: true, + }) + expect(r.dryRun).toBe(true) + expect(r.prUrl).toContain('compare/main...eval/r1') + expect(execSpy).not.toHaveBeenCalled() + }) +}) diff --git a/tests/production-loop-example.test.ts b/tests/production-loop-example.test.ts new file mode 100644 index 0000000..51e4179 --- /dev/null +++ b/tests/production-loop-example.test.ts @@ -0,0 +1,186 @@ +/** + * Worked-example regression: runs the same shape as + * `examples/production-loop/index.ts` against the in-memory stores and + * verifies a PR-shaped output. If the example breaks, this test fails + * — which means CI catches example rot without spawning a tsx process. + */ +import { describe, expect, it } from 'vitest' + +import type { + AutoPrClient, + ProposeAutomatedPullRequestInput, + ProposeAutomatedPullRequestResult, +} from '../src/auto-pr' +import { InMemoryFeedbackTrajectoryStore } from '../src/feedback-trajectory' +import { runProductionLoop } from '../src/production-loop' +import { InMemoryTraceStore } from '../src/trace/store' +import type { Scenario } from '../src/types' + +interface TaxAgentPayload { + systemPrompt: string +} + +function scenario(id: string, persona: string): Scenario { + return { + id, + persona, + label: id, + thesis: `Filing scenario: ${id}`, + dimensions: ['correctness'], + turns: [{ user: 'Help me file my taxes', expectedBehaviors: ['cite a statute'] }], + artifactChecks: [], + } +} + +describe('worked example: production-loop demo end-to-end', () => { + it('fires the loop on a synthetic prod failure cluster and produces a PR-shaped output', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + // Seed 8 prod failures in the same cluster. + for (let i = 0; i < 8; i++) { + await traceStore.appendRun({ + runId: `prod-run-${i}`, + scenarioId: 'ftc-noncompete-edge', + startedAt: Date.now() - 3_600_000 + i * 10_000, + endedAt: Date.now() - 3_600_000 + i * 10_000 + 500, + status: 'failed', + outcome: { pass: false, score: 0.2, failureClass: 'instruction_following' }, + }) + await feedbackStore.save({ + id: `ft-${i}`, + scenarioId: 'ftc-noncompete-edge', + task: { intent: 'FTC question' }, + attempts: [], + labels: [ + { + source: 'user', + kind: 'reject', + value: { thumb: 'down' }, + severity: 'error', + createdAt: new Date().toISOString(), + }, + ], + createdAt: new Date().toISOString(), + }) + } + + const captured: ProposeAutomatedPullRequestInput[] = [] + const prClient: AutoPrClient = { + proposeChange(input): Promise { + captured.push(input) + return Promise.resolve({ + prUrl: `https://github.com/o/r/pull/synthetic-1`, + branchName: input.branchName, + headSha: 'cafe1234'.padEnd(40, '0'), + dryRun: false, + }) + }, + } + + const baselinePrompt = + 'You are a tax assistant. Be helpful and concise. ' + + 'Answer questions about US tax forms and rules.' + + const result = await runProductionLoop({ + runId: 'worked-example-prod-loop', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: { minClusterSize: 5, minSeverityRatio: 0.05, maxClustersPerCycle: 1 }, + evolve: { + baselinePrompt: { systemPrompt: baselinePrompt }, + holdoutScenarios: [ + scenario('ftc-noncompete-edge', 'small-biz-owner'), + scenario('schedule-c-self-employed', 'freelancer'), + scenario('w2-multistate', 'remote-worker'), + ], + searchScenarios: [scenario('basic-w2', 'salaried'), scenario('joint-return', 'married')], + runner: { + run: ({ variant, scenarioId }) => ({ + trace: { + scenarioId, + turns: [ + { role: 'user', content: 'help' }, + { role: 'assistant', content: variant.payload.systemPrompt }, + ], + transcript: variant.payload.systemPrompt, + }, + costUsd: 0.01, + durationMs: 5, + }), + }, + scorer: { + score: ({ variant }) => { + const lengthSignal = Math.min(1, variant.payload.systemPrompt.length / 300) + const citationBonus = /cite/i.test(variant.payload.systemPrompt) ? 0.3 : 0 + const refuseBonus = /refuse/i.test(variant.payload.systemPrompt) ? 0.15 : 0 + const score = Math.min(1, 0.3 + lengthSignal * 0.4 + citationBonus + refuseBonus) + return { score, ok: score >= 0.6 } + }, + }, + mutator: { + mutate: async ({ parent, childCount, generation }) => + Array.from({ length: childCount }, (_, i) => ({ + id: `${parent.id}-cite-${generation}-${i}`, + label: 'cite-required', + generation, + parentId: parent.id, + payload: { + systemPrompt: + `${parent.payload.systemPrompt} ` + + 'When the question concerns a contested rule, you MUST cite the statute by section number, ' + + 'and you MUST refuse to answer if the rule is not in your active corpus.', + }, + })), + }, + gate: { + baselineKey: 'baseline', + minProductiveRuns: 3, + pairedDeltaThreshold: 0, + overfitGapThreshold: 0.5, + seed: 1234, + }, + reps: 2, + generations: 2, + populationSize: 2, + }, + releaseThresholds: { + requireCorpus: false, + minPassRate: 0.5, + minMeanScore: 0.5, + minSearchRuns: 1, + minHoldoutRuns: 1, + requireAsiForFailures: false, + }, + ship: { + client: prClient, + repo: { owner: 'tangle-network', name: 'tax-agent' }, + branchPrefix: 'eval/auto-improve', + promptFilePath: 'prompts/tax-agent-system.txt', + reviewers: ['drew'], + labels: ['production-loop'], + }, + cron: { cadence: 'weekly' }, + }) + + expect(result.decision).toBe('pr_opened') + expect(result.observedRunCount).toBe(8) + expect(result.observedFeedbackCount).toBe(8) + expect(result.actedOnCluster).not.toBeNull() + expect(result.actedOnCluster?.runCount).toBe(8) + expect(result.gate?.promote).toBe(true) + expect(result.release?.status).toBe('pass') + expect(result.pullRequest).not.toBeNull() + expect(captured).toHaveLength(1) + + const pr = captured[0] + expect(pr).toBeDefined() + expect(pr?.fileChanges).toHaveLength(1) + expect(pr?.fileChanges[0]?.path).toBe('prompts/tax-agent-system.txt') + expect(pr?.fileChanges[0]?.contents).toContain('cite the statute') + expect(pr?.body).toContain('Triggering failure cluster') + expect(pr?.body).toContain('Held-out promotion gate') + expect(pr?.body).toContain('Release confidence') + expect(pr?.title).toContain('tax-agent: production-loop prompt update') + }) +}) diff --git a/tests/production-loop.test.ts b/tests/production-loop.test.ts new file mode 100644 index 0000000..64d3291 --- /dev/null +++ b/tests/production-loop.test.ts @@ -0,0 +1,460 @@ +/** + * Production-loop tests. + * + * Regression coverage: + * - cluster → evolve → propose-PR is deterministic with a fake-fetch + * + fake-gh-api client + * - the loop short-circuits to `no_actionable_failures` when no + * cluster crosses the severity threshold + * - the loop short-circuits to `gate_failed` when the held-out gate + * rejects (fail-closed: a passing evolve does not auto-ship) + * - the loop returns `proposed_change` when no `ship` is wired + * (consumers may persist artifacts without opening a PR) + * - PR body renders cluster details + gate evidence (regression: a + * PR with no context is unreviewable) + * - `AutoPrClient.proposeChange` is called exactly once on success + * + * No network. No `gh` shell-out. Fake clients verify the contract. + */ +import { describe, expect, it } from 'vitest' + +import type { + AutoPrClient, + ProposeAutomatedPullRequestInput, + ProposeAutomatedPullRequestResult, +} from '../src/auto-pr' +import { InMemoryFeedbackTrajectoryStore } from '../src/feedback-trajectory' +import type { + MultiShotMutateAdapter, + MultiShotRunner, + MultiShotScorer, +} from '../src/multi-shot-optimization' +import { runProductionLoop } from '../src/production-loop' +import { InMemoryTraceStore } from '../src/trace/store' +import type { Scenario } from '../src/types' + +interface Payload { + systemPrompt: string +} + +function scenario(id: string): Scenario { + return { + id, + persona: 'tax-filer', + label: id, + thesis: `Scenario ${id}`, + dimensions: ['correctness'], + turns: [{ user: 'Help me file my taxes', expectedBehaviors: ['gather state'] }], + artifactChecks: [], + } +} + +/** + * Deterministic runner: scoring is a pure function of (systemPrompt + * length × scenarioId). A longer prompt wins, modeling the intuition + * that the evolve step is exploring richer prompts. + */ +function makeRunner(): MultiShotRunner { + return { + run: ({ variant, scenarioId }) => ({ + trace: { + scenarioId, + turns: [ + { role: 'user', content: 'help' }, + { role: 'assistant', content: variant.payload.systemPrompt }, + ], + transcript: variant.payload.systemPrompt, + }, + costUsd: 0.01, + durationMs: 5, + }), + } +} + +function makeScorer(): MultiShotScorer { + return { + score: ({ variant }) => { + const score = Math.min(1, variant.payload.systemPrompt.length / 200) + return { score, ok: score >= 0.6 } + }, + } +} + +function makeImprovementMutator(): MultiShotMutateAdapter { + return { + mutate: async ({ parent, childCount, generation }) => { + const longer = `${parent.payload.systemPrompt} ` + + 'Always cite the source statute by section number. ' + + 'Refuse to answer if the FTC rule cited is not in the active corpus. ' + return Array.from({ length: childCount }, (_, i) => ({ + id: `${parent.id}-improved-${generation}-${i}`, + label: 'improved', + generation, + parentId: parent.id, + payload: { systemPrompt: longer }, + })) + }, + } +} + +function makeIdentityMutator(): MultiShotMutateAdapter { + return { + mutate: async () => [], + } +} + +function makeFailedRun(traceStore: InMemoryTraceStore, runId: string, scenarioId: string): void { + traceStore.appendRun({ + runId, + scenarioId, + startedAt: Date.now(), + endedAt: Date.now() + 1, + status: 'failed', + outcome: { pass: false, score: 0.1, failureClass: 'reasoning_error' }, + }) +} + +function fakeAutoPrClient(): { client: AutoPrClient; calls: ProposeAutomatedPullRequestInput[] } { + const calls: ProposeAutomatedPullRequestInput[] = [] + const client: AutoPrClient = { + proposeChange(input): Promise { + calls.push(input) + return Promise.resolve({ + prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/pull/42`, + branchName: input.branchName, + headSha: 'cafe1234'.padEnd(40, '0'), + dryRun: false, + }) + }, + } + return { client, calls } +} + +describe('runProductionLoop', () => { + it('returns no_actionable_failures when the failure cluster is below the severity threshold', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + // 1 failure out of 1 run, but minClusterSize=5 forces no action. + makeFailedRun(traceStore, 'r-1', 'noisy') + + const result = await runProductionLoop({ + runId: 'prod-loop-quiet', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: { minClusterSize: 5, minSeverityRatio: 0.5 }, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: makeImprovementMutator(), + baselinePrompt: { systemPrompt: 'You are a tax assistant.' }, + holdoutScenarios: [scenario('hold-a'), scenario('hold-b')], + gate: { baselineKey: 'baseline', minProductiveRuns: 1, pairedDeltaThreshold: 0 }, + }, + }) + + expect(result.decision).toBe('no_actionable_failures') + expect(result.evolution).toBeNull() + expect(result.pullRequest).toBeNull() + expect(result.promotedPrompt).toEqual({ systemPrompt: 'You are a tax assistant.' }) + expect(result.observedRunCount).toBe(1) + }) + + it('runs evolve and returns proposed_change when ship is not wired', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + // 5 failed runs in the same cluster — crosses minClusterSize=5. + for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge') + + const result = await runProductionLoop({ + runId: 'prod-loop-evolve', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: { minClusterSize: 5, minSeverityRatio: 0 }, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: makeImprovementMutator(), + baselinePrompt: { systemPrompt: 'Short.' }, + holdoutScenarios: [scenario('hold-a'), scenario('hold-b'), scenario('hold-c')], + searchScenarios: [scenario('search-a'), scenario('search-b')], + gate: { + baselineKey: 'baseline', + minProductiveRuns: 1, + pairedDeltaThreshold: -1, + overfitGapThreshold: 1, + seed: 7, + }, + reps: 1, + generations: 2, + populationSize: 2, + }, + releaseThresholds: { + minPassRate: 0.0, + minMeanScore: 0.0, + minSearchRuns: 1, + minHoldoutRuns: 1, + maxOverfitGap: 1, + requireAsiForFailures: false, + requireCorpus: false, + }, + }) + + expect(result.decision).toBe('proposed_change') + expect(result.evolution).not.toBeNull() + expect(result.actedOnCluster).not.toBeNull() + expect(result.pullRequest).toBeNull() + // The mutator extends the prompt — promoted must differ from baseline. + expect((result.promotedPrompt as Payload).systemPrompt).not.toBe('Short.') + expect((result.promotedPrompt as Payload).systemPrompt.length).toBeGreaterThan( + (result.baselinePrompt as Payload).systemPrompt.length, + ) + }) + + it('opens a PR when ship is wired, gate passes, and release is green', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge') + const { client, calls } = fakeAutoPrClient() + + const result = await runProductionLoop({ + runId: 'prod-loop-ship', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: { minClusterSize: 5, minSeverityRatio: 0 }, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: makeImprovementMutator(), + baselinePrompt: { systemPrompt: 'Short.' }, + holdoutScenarios: [scenario('hold-a'), scenario('hold-b'), scenario('hold-c')], + searchScenarios: [scenario('search-a'), scenario('search-b')], + gate: { + baselineKey: 'baseline', + minProductiveRuns: 1, + pairedDeltaThreshold: -1, + overfitGapThreshold: 1, + seed: 7, + }, + reps: 1, + generations: 2, + populationSize: 2, + }, + releaseThresholds: { + minPassRate: 0.0, + minMeanScore: 0.0, + minSearchRuns: 1, + minHoldoutRuns: 1, + maxOverfitGap: 1, + requireAsiForFailures: false, + requireCorpus: false, + }, + ship: { + client, + repo: { owner: 'tangle-network', name: 'tax-agent' }, + branchPrefix: 'eval/auto-improve', + promptFilePath: 'prompts/system.txt', + reviewers: ['drew'], + labels: ['production-loop'], + }, + cron: { cadence: 'weekly' }, + }) + + expect(result.decision).toBe('pr_opened') + expect(result.pullRequest).not.toBeNull() + expect(result.pullRequest?.prUrl).toContain('/pull/42') + expect(result.pullRequest?.branchName).toBe('eval/auto-improve/prod-loop-ship') + // The PR must carry the prompt file diff + cluster context. + expect(calls).toHaveLength(1) + expect(calls[0]?.fileChanges).toHaveLength(1) + expect(calls[0]?.fileChanges[0]?.path).toBe('prompts/system.txt') + expect(calls[0]?.body).toContain('Held-out promotion gate') + expect(calls[0]?.body).toContain('Release confidence') + expect(calls[0]?.body).toContain('Triggering failure cluster') + expect(calls[0]?.reviewers).toEqual(['drew']) + expect(calls[0]?.labels).toEqual(['production-loop']) + expect(result.cron?.cadence).toBe('weekly') + }) + + it('fails closed: when the held-out gate rejects, no PR is opened', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge') + const { client, calls } = fakeAutoPrClient() + + // Mutator returns variants that SCORE WORSE — the gate will reject. + const worseMutator: MultiShotMutateAdapter = { + mutate: async ({ parent, childCount, generation }) => + Array.from({ length: childCount }, (_, i) => ({ + id: `${parent.id}-worse-${generation}-${i}`, + label: 'worse', + generation, + parentId: parent.id, + payload: { systemPrompt: '' }, + })), + } + + const result = await runProductionLoop({ + runId: 'prod-loop-reject', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: { minClusterSize: 5, minSeverityRatio: 0 }, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: worseMutator, + baselinePrompt: { + systemPrompt: + 'You are a tax assistant. Cite sources. Refuse unsupported claims. Walk through state-by-state.', + }, + holdoutScenarios: [scenario('hold-a'), scenario('hold-b'), scenario('hold-c')], + searchScenarios: [scenario('search-a'), scenario('search-b')], + gate: { + baselineKey: 'baseline', + minProductiveRuns: 1, + pairedDeltaThreshold: 0, + overfitGapThreshold: 1, + seed: 7, + }, + reps: 1, + generations: 2, + populationSize: 2, + }, + releaseThresholds: { + minPassRate: 0.0, + minMeanScore: 0.0, + minSearchRuns: 1, + minHoldoutRuns: 1, + maxOverfitGap: 1, + requireAsiForFailures: false, + requireCorpus: false, + }, + ship: { + client, + repo: { owner: 'tangle-network', name: 'tax-agent' }, + branchPrefix: 'eval/auto-improve', + promptFilePath: 'prompts/system.txt', + }, + }) + + // Either evolve_yielded_no_improvement (when search-best stays baseline) + // or gate_failed (when search-best wins search but holdout fails). Both + // mean: no PR opened. That's the load-bearing assertion. + expect(['gate_failed', 'evolve_yielded_no_improvement']).toContain(result.decision) + expect(result.pullRequest).toBeNull() + expect(calls).toHaveLength(0) + }) + + it('returns evolve_yielded_no_improvement when the mutator returns no children', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge') + + const result = await runProductionLoop({ + runId: 'prod-loop-noop', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: { minClusterSize: 5, minSeverityRatio: 0 }, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: makeIdentityMutator(), + baselinePrompt: { systemPrompt: 'baseline-only.' }, + holdoutScenarios: [scenario('hold-a'), scenario('hold-b'), scenario('hold-c')], + searchScenarios: [scenario('search-a'), scenario('search-b')], + gate: { baselineKey: 'baseline', minProductiveRuns: 1, pairedDeltaThreshold: 0, seed: 7 }, + reps: 1, + generations: 1, + populationSize: 1, + }, + releaseThresholds: { requireCorpus: false, requireAsiForFailures: false }, + }) + + expect(result.decision).toBe('evolve_yielded_no_improvement') + expect(result.pullRequest).toBeNull() + }) + + it('validates inputs (regression: empty runId, empty holdout, conflicting search/holdout)', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + + await expect( + runProductionLoop({ + runId: ' ', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: {}, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: makeImprovementMutator(), + baselinePrompt: { systemPrompt: 'x' }, + holdoutScenarios: [scenario('h-a')], + gate: { baselineKey: 'baseline' }, + }, + }), + ).rejects.toThrow(/runId required/) + + await expect( + runProductionLoop({ + runId: 'r', + target: ' ', + traceStore, + feedbackStore, + cluster: {}, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: makeImprovementMutator(), + baselinePrompt: { systemPrompt: 'x' }, + holdoutScenarios: [scenario('h-a')], + gate: { baselineKey: 'baseline' }, + }, + }), + ).rejects.toThrow(/target required/) + + await expect( + runProductionLoop({ + runId: 'r', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: {}, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: makeImprovementMutator(), + baselinePrompt: { systemPrompt: 'x' }, + holdoutScenarios: [], + gate: { baselineKey: 'baseline' }, + }, + }), + ).rejects.toThrow(/holdoutScenarios must not be empty/) + + // Search/holdout overlap. + for (let i = 0; i < 5; i++) makeFailedRun(traceStore, `r-${i}`, 'tax-edge') + await expect( + runProductionLoop({ + runId: 'r', + target: 'tax-agent', + traceStore, + feedbackStore, + cluster: { minClusterSize: 5, minSeverityRatio: 0 }, + evolve: { + runner: makeRunner(), + scorer: makeScorer(), + mutator: makeImprovementMutator(), + baselinePrompt: { systemPrompt: 'x' }, + holdoutScenarios: [scenario('a')], + searchScenarios: [scenario('a')], + gate: { baselineKey: 'baseline' }, + }, + }), + ).rejects.toThrow(/disjoint/) + }) +}) diff --git a/tests/wire-ingestion.test.ts b/tests/wire-ingestion.test.ts new file mode 100644 index 0000000..bdca59e --- /dev/null +++ b/tests/wire-ingestion.test.ts @@ -0,0 +1,295 @@ +/** + * Wire-protocol ingestion tests (0.25.0). + * + * Regression coverage: + * - POST /v1/feedback persists into the configured FeedbackTrajectoryStore + * - POST /v1/feedback returns 400 ValidationError on malformed payload + * - POST /v1/feedback returns 503 when no store is wired + * - POST /v1/traces/ingest accepts both JSON ({events:[...]}) and NDJSON + * - POST /v1/traces/ingest reports per-event errors without poisoning the batch + * - Bearer auth (when configured) blocks ingestion without a valid token + * but never blocks /healthz or /v1/version + * - OpenAPI doc lists the new endpoints + components + */ +import { describe, expect, it } from 'vitest' + +import { InMemoryFeedbackTrajectoryStore } from '../src/feedback-trajectory' +import { InMemoryTraceStore } from '../src/trace/store' +import { createApp } from '../src/wire/server' + +async function postJson(app: ReturnType, path: string, body: unknown, headers?: Record) { + const res = await app.fetch( + new Request(`http://localhost${path}`, { + method: 'POST', + headers: { 'content-type': 'application/json', ...headers }, + body: JSON.stringify(body), + }), + ) + const responseBody = res.status === 204 ? null : await res.json().catch(() => null) + return { status: res.status, body: responseBody } +} + +async function postRaw( + app: ReturnType, + path: string, + body: string, + contentType: string, + headers?: Record, +) { + const res = await app.fetch( + new Request(`http://localhost${path}`, { + method: 'POST', + headers: { 'content-type': contentType, ...headers }, + body, + }), + ) + const responseBody = res.status === 204 ? null : await res.json().catch(() => null) + return { status: res.status, body: responseBody } +} + +function validFeedback(id = 'ft_1') { + return { + id, + task: { intent: 'help filing 2025 taxes' }, + attempts: [], + labels: [ + { + source: 'user', + kind: 'approve', + value: { thumb: 'up' }, + createdAt: '2026-05-14T00:00:00Z', + }, + ], + createdAt: '2026-05-14T00:00:00Z', + } +} + +describe('POST /v1/feedback', () => { + it('persists a feedback trajectory and returns 200 with id + persisted:true', async () => { + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + const app = createApp({ stores: { feedbackStore } }) + + const r = await postJson(app, '/v1/feedback', validFeedback('ft_persist')) + expect(r.status).toBe(200) + expect((r.body as { id: string }).id).toBe('ft_persist') + expect((r.body as { persisted: boolean }).persisted).toBe(true) + + const stored = await feedbackStore.get('ft_persist') + expect(stored).not.toBeNull() + expect(stored?.task.intent).toBe('help filing 2025 taxes') + }) + + it('returns 400 ValidationError on malformed payload (regression: silent 200 on bad input)', async () => { + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + const app = createApp({ stores: { feedbackStore } }) + + const r = await postJson(app, '/v1/feedback', { id: 'x' /* missing task */ }) + expect(r.status).toBe(400) + expect((r.body as { error: { code: string } }).error.code).toBe('validation_error') + }) + + it('returns 503 when no feedback store is configured', async () => { + const app = createApp() // no stores + const r = await postJson(app, '/v1/feedback', validFeedback()) + expect(r.status).toBe(503) + expect((r.body as { error: { code: string } }).error.code).toBe('service_unavailable') + }) + + it('is idempotent on id (re-posting replaces)', async () => { + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + const app = createApp({ stores: { feedbackStore } }) + + await postJson(app, '/v1/feedback', validFeedback('ft_dup')) + const second = await postJson(app, '/v1/feedback', { + ...validFeedback('ft_dup'), + task: { intent: 'NEW intent' }, + }) + expect(second.status).toBe(200) + const stored = await feedbackStore.get('ft_dup') + expect(stored?.task.intent).toBe('NEW intent') + }) +}) + +describe('POST /v1/traces/ingest', () => { + it('accepts JSON body and persists events', async () => { + const traceStore = new InMemoryTraceStore() + const app = createApp({ stores: { traceStore } }) + + const r = await postJson(app, '/v1/traces/ingest', { + events: [ + { + eventId: 'e1', + runId: 'r1', + kind: 'log', + timestamp: 1_700_000_000_000, + payload: { msg: 'hello' }, + }, + { + eventId: 'e2', + runId: 'r1', + kind: 'error', + timestamp: 1_700_000_001_000, + payload: { msg: 'boom' }, + }, + ], + }) + + expect(r.status).toBe(200) + expect((r.body as { accepted: number }).accepted).toBe(2) + expect((r.body as { rejected: number }).rejected).toBe(0) + const events = await traceStore.events({ runId: 'r1' }) + expect(events).toHaveLength(2) + expect(events.map((e) => e.eventId).sort()).toEqual(['e1', 'e2']) + }) + + it('accepts NDJSON body and persists events', async () => { + const traceStore = new InMemoryTraceStore() + const app = createApp({ stores: { traceStore } }) + + const ndjson = [ + JSON.stringify({ + eventId: 'n1', + runId: 'r2', + kind: 'log', + timestamp: 1, + payload: { line: 1 }, + }), + JSON.stringify({ + eventId: 'n2', + runId: 'r2', + kind: 'log', + timestamp: 2, + payload: { line: 2 }, + }), + '', + ].join('\n') + + const r = await postRaw(app, '/v1/traces/ingest', ndjson, 'application/x-ndjson') + expect(r.status).toBe(200) + expect((r.body as { accepted: number }).accepted).toBe(2) + + const events = await traceStore.events({ runId: 'r2' }) + expect(events).toHaveLength(2) + }) + + it('returns 400 on malformed schema', async () => { + const traceStore = new InMemoryTraceStore() + const app = createApp({ stores: { traceStore } }) + const r = await postJson(app, '/v1/traces/ingest', { events: [{ eventId: '', runId: 'r1' }] }) + expect(r.status).toBe(400) + expect((r.body as { error: { code: string } }).error.code).toBe('validation_error') + }) + + it('returns 503 when no trace store is configured', async () => { + const app = createApp() + const r = await postJson(app, '/v1/traces/ingest', { + events: [{ eventId: 'e', runId: 'r', kind: 'log', timestamp: 1, payload: {} }], + }) + expect(r.status).toBe(503) + }) +}) + +describe('bearer auth (opt-in)', () => { + it('blocks ingestion without an Authorization header', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + const app = createApp({ + stores: { traceStore, feedbackStore }, + auth: { bearer: 'sk-prod-1234' }, + }) + + const r = await postJson(app, '/v1/feedback', validFeedback()) + expect(r.status).toBe(401) + const r2 = await postJson(app, '/v1/traces/ingest', { + events: [{ eventId: 'e', runId: 'r', kind: 'log', timestamp: 1, payload: {} }], + }) + expect(r2.status).toBe(401) + }) + + it('accepts a valid bearer token', async () => { + const traceStore = new InMemoryTraceStore() + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + const app = createApp({ + stores: { traceStore, feedbackStore }, + auth: { bearer: 'sk-prod-1234' }, + }) + const r = await postJson(app, '/v1/feedback', validFeedback(), { + authorization: 'Bearer sk-prod-1234', + }) + expect(r.status).toBe(200) + }) + + it('rejects a wrong bearer token', async () => { + const traceStore = new InMemoryTraceStore() + const app = createApp({ + stores: { traceStore }, + auth: { bearer: 'sk-prod-1234' }, + }) + const r = await postJson( + app, + '/v1/traces/ingest', + { events: [{ eventId: 'e', runId: 'r', kind: 'log', timestamp: 1, payload: {} }] }, + { authorization: 'Bearer wrong' }, + ) + expect(r.status).toBe(401) + }) + + it('always exempts /healthz and /v1/version (regression: lock-out from monitoring)', async () => { + const app = createApp({ + auth: { bearer: 'sk-prod-1234' }, + stores: {}, + }) + const health = await app.fetch(new Request('http://localhost/healthz')) + expect(health.status).toBe(200) + const version = await app.fetch(new Request('http://localhost/v1/version')) + expect(version.status).toBe(200) + }) + + it('supports a verifier function for rotating tokens', async () => { + const feedbackStore = new InMemoryFeedbackTrajectoryStore() + let calledWith: string | undefined + const app = createApp({ + stores: { feedbackStore }, + auth: { + bearer: (token: string) => { + calledWith = token + return token.startsWith('rotating-') + }, + }, + }) + + const ok = await postJson(app, '/v1/feedback', validFeedback(), { + authorization: 'Bearer rotating-abc', + }) + expect(ok.status).toBe(200) + expect(calledWith).toBe('rotating-abc') + + const bad = await postJson(app, '/v1/feedback', validFeedback('ft_2'), { + authorization: 'Bearer static-token', + }) + expect(bad.status).toBe(401) + }) +}) + +describe('OpenAPI spec', () => { + it('lists the new ingestion endpoints + component schemas', async () => { + const app = createApp() + const res = await app.fetch(new Request('http://localhost/openapi.json')) + expect(res.status).toBe(200) + const spec = (await res.json()) as { + paths: Record + components: { schemas: Record } + } + expect(Object.keys(spec.paths)).toEqual( + expect.arrayContaining(['/v1/feedback', '/v1/traces/ingest']), + ) + expect(Object.keys(spec.components.schemas)).toEqual( + expect.arrayContaining([ + 'FeedbackTrajectory', + 'FeedbackIngestResponse', + 'TracesIngestRequest', + 'TracesIngestResponse', + ]), + ) + }) +})