diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0a61fa6 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,35 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + ci: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: pnpm/action-setup@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: pnpm + + - name: Install deps + run: pnpm install --frozen-lockfile + + - name: Lint (biome) + run: pnpm run lint + + - name: Typecheck + run: pnpm run typecheck + + - name: Test + run: pnpm run test + + - name: Build + run: pnpm run build diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..e59070d --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,83 @@ +name: Publish + +on: + push: + tags: + - 'v*' + workflow_dispatch: + +jobs: + verify: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: pnpm/action-setup@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: pnpm + registry-url: https://registry.npmjs.org + + - name: Install deps + run: pnpm install --frozen-lockfile + + - name: Lint (biome) + run: pnpm run lint + + - name: Typecheck + run: pnpm run typecheck + + - name: Test + run: pnpm run test + + - name: Build + run: pnpm run build + + - name: Verify tag/version lock + run: | + NPM_VERSION=$(node -p "require('./package.json').version") + if [[ "${GITHUB_REF:-}" == refs/tags/v* ]]; then + TAG_VERSION="${GITHUB_REF#refs/tags/v}" + if [ "$TAG_VERSION" != "$NPM_VERSION" ]; then + echo "::error::Tag/version mismatch: tag=$TAG_VERSION package=$NPM_VERSION." 
+ exit 1 + fi + fi + echo "Version locked: $NPM_VERSION" + + publish-npm: + needs: verify + if: startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v4 + + - uses: pnpm/action-setup@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: pnpm + registry-url: https://registry.npmjs.org + + - run: pnpm install --frozen-lockfile + - run: pnpm run build + + # Idempotent: re-running a tag whose npm version is already published + # must not fail the workflow. + - name: Publish to npm (skip if already published) + run: | + NAME=$(node -p "require('./package.json').name") + VERSION=$(node -p "require('./package.json').version") + if npm view "$NAME@$VERSION" version >/dev/null 2>&1; then + echo "$NAME@$VERSION already on registry; skipping publish" + else + pnpm publish --no-git-checks --access public + fi + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} diff --git a/README.md b/README.md index cfd0640..52baedd 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,32 @@ -# agent-runtime - -Reusable runtime lifecycle for domain-specific agents. Standardizes the -task lifecycle (knowledge readiness → questions/acquisition → control loop -→ eval) and delegates domain behavior to an adapter. Owns no domain -policy, models, tools, connectors, or UI. 
- -## Contents - -- [Overview](#overview) -- [Install](#install) -- [Getting started](#getting-started) -- [When to use which entry point](#when-to-use-which-entry-point) -- [Backends for `runAgentTaskStream`](#backends-for-runagenttaskstream) -- [Lifecycle events](#lifecycle-events) -- [Knowledge providers](#knowledge-providers) -- [Sanitized telemetry](#sanitized-telemetry) -- [Package boundaries](#package-boundaries) -- [Examples](#examples) - -## Overview - -```txt -TaskSpec - → Knowledge readiness - → Question / acquisition decision - → Agent control loop (observe / validate / decide / act) - → Eval / verification - → Run evidence -``` - -For product agents that own a streaming model backend: - -```txt -TaskSpec - → Knowledge readiness - → Session create/resume - → Backend stream - → Sanitized RuntimeStreamEvent / SSE -``` +# @tangle-network/agent-runtime -## Install +Production runtime substrate for domain agents. Owns the task lifecycle +(knowledge readiness, control loop, session resume, sanitized telemetry, +canonical `RuntimeRunRow` persistence + cost ledger) so domain repos stop +inventing their own. 
```bash pnpm add @tangle-network/agent-runtime @tangle-network/agent-eval ``` -## Getting started +## What you get + +| Entry point | When to reach for it | +|---|---| +| `runAgentTask` | Single-shot adapter-driven task with eval/verification | +| `runAgentTaskStream` | Streaming product loop with session resume + backends | +| `startRuntimeRun` | Canonical production-run row + cost ledger (NEW in 0.7.0) | +| `createTraceBridge` | Map `RuntimeStreamEvent` → `agent-eval` `TraceEvent` (NEW in 0.7.0) | +| `decideKnowledgeReadiness` | `ready` / `blocked` / `caveat` branch for routes / UI | +| `createOpenAICompatibleBackend` | OpenAI-compatible streaming backend (TCloud / cli-bridge) | +| `createSandboxPromptBackend` | Sandbox / sidecar `streamPrompt` clients | +| `createRuntimeStreamEventCollector` | Default-redacted sanitized telemetry over a stream | + +Every public export is annotated `@stable` or `@experimental`. `@stable` +exports do not change shape inside a minor; `@experimental` exports may +change inside a minor and require a deliberate consumer bump. -The smallest possible task — a domain adapter responding to one task with -no streaming: +## Quickstart ```ts import { runAgentTask } from '@tangle-network/agent-runtime' @@ -63,7 +41,7 @@ const result = await runAgentTask({ async observe() { return { /* domain state */ } }, async validate({ state }) { return [/* eval results */] }, async decide({ state }) { - return { kind: 'finish', reason: 'review complete' } + return { type: 'stop', pass: true, score: 1, reason: 'review complete' } }, async act() { return undefined }, }, @@ -72,165 +50,119 @@ const result = await runAgentTask({ console.log(result.status, result.runRecords) ``` -Full runnable: [`examples/basic-task/`](./examples/basic-task/). 
- -## When to use which entry point - -| You want… | Use | -|---|---| -| Single-shot task with eval/verification | `runAgentTask` | -| Streaming product loop with session resume | `runAgentTaskStream` + a backend factory | -| Just SSE serialization for an existing readiness report | `readinessServerSentEvent` | -| Just sanitized telemetry over an existing run | `createRuntimeEventCollector` (+ `summarizeAgentTaskRun`) for `runAgentTask`, or `createRuntimeStreamEventCollector` for `runAgentTaskStream` | -| Stable readiness branching (`ready` / `blocked` / `caveat`) in a route | `decideKnowledgeReadiness` | - -## Backends for `runAgentTaskStream` +## Canonical production-run lifecycle (NEW in 0.7.0) -Three SDK-agnostic factories ship in core: - -| Factory | When | -|---|---| -| `createOpenAICompatibleBackend` | TCloud / OpenAI-compatible chat APIs | -| `createSandboxPromptBackend` | Sandbox / sidecar `streamPrompt` clients | -| `createIterableBackend` | Custom coding harnesses, browser agents | - -For [cli-bridge](https://github.com/drewstone/cli-bridge) (or any other -OpenAI-compatible HTTP gateway), use `createOpenAICompatibleBackend` pointed -at the gateway's `/v1/chat/completions` URL — the cli-bridge harness/model -selector is just an OpenAI `model` string like `claude/sonnet` or -`codex/gpt-5-codex`. - -Adapters are intentionally thin. Product repos still own client -construction, auth, concrete tool permissions, and UI behavior. See -[`examples/sandbox-stream-backend/`](./examples/sandbox-stream-backend/) and -[`examples/openai-stream-backend/`](./examples/openai-stream-backend/) for -runnable wirings. - -## Lifecycle events - -`runAgentTask` and `runAgentTaskStream` emit typed lifecycle events -through `onEvent`: +`startRuntimeRun` is the ONE abstraction for "the agent did a thing on +behalf of a customer; record what it did, what it cost, how it ended." 
+Replaces bespoke `agentRuns`-row helpers (legal-agent's +`completeProductionAgentRun` + `persistRuntimeRun` pair is the canonical +example of what this subsumes). ```ts -await runAgentTask({ - task, adapter, - onEvent(event) { - console.log(event.type) - }, +import { startRuntimeRun, runAgentTaskStream } from '@tangle-network/agent-runtime' + +const run = startRuntimeRun({ + workspaceId: 'ws-1', + sessionId: threadId, + agentId: 'legal-chat-runtime', + taskSpec, + scenarioId: `legal-chat:${threadId}`, + adapter: { upsert: (row) => db.insert(agentRuns).values(row) }, }) -``` -Events cover readiness, question answering, acquisition, control-loop -steps, and task completion. Every transition is observable without -coupling domain adapters to logging, streaming, or telemetry concerns. - -This package does **not** stream model tokens for you. Domain adapters -and product routes still own model calls, tool execution, and token -streaming. agent-runtime emits lifecycle events around those actions. +for await (const event of runAgentTaskStream({ task: taskSpec, backend, input })) { + run.observe(event) // llm_call events update the cost ledger + if (event.type === 'final') { + run.complete({ + status: event.status === 'completed' ? 'completed' : 'failed', + resultSummary: event.text ?? '', + error: event.status === 'failed' ? event.reason : undefined, + }) + } +} -## Knowledge providers +await run.persist({ runtimeEvents: telemetry.events }) // telemetry: e.g. a createRuntimeStreamEventCollector() fed each event via telemetry.onEvent(event) +console.log(run.cost()) // { tokensIn, tokensOut, costUsd, wallMs, llmCalls } +``` -Optional. A knowledge provider implements: +Full runnable: [`examples/runtime-run/`](./examples/runtime-run/). 
-- `buildReadiness` — score readiness against the task's required knowledge -- `answerQuestions` — handle outstanding user questions -- `executeAcquisitionPlans` — fetch missing evidence -- `refreshReadiness` — rerun scoring after acquisition +## agent-eval trace bridge (NEW in 0.7.0) -Lets a task collect missing context before the control loop starts, then -rerun readiness against new evidence. If readiness fails, `runAgentTask` -stops before domain actions; adapters can override `onKnowledgeBlocked` -to emit a domain action (asking a user, querying a connector, etc.). +If you persist traces in agent-eval's `TraceStore`, map runtime stream +events to `TraceEvent` once and stop hand-rolling the adapter in every +domain repo: -For control policies or route handlers that need a stable readiness -branch, use `decideKnowledgeReadiness(report)` — it returns `ready`, -`blocked`, or `caveat` plus gap IDs and the recommended action. +```ts +import { createTraceBridge } from '@tangle-network/agent-runtime' -## Sanitized telemetry +const bridge = createTraceBridge({ runId, spanId }) +for await (const event of runAgentTaskStream({ task, backend, input })) { + const trace = bridge.toTraceEvent(event) + if (trace) await traceStore.appendEvent(trace) +} +``` -For logs, reports, UI telemetry — never serialize raw events directly. 
-Use the built-in sanitized collector: +## Error taxonomy -```ts -import { - createRuntimeEventCollector, - summarizeAgentTaskRun, -} from '@tangle-network/agent-runtime' +Every public function throws one of: -const telemetry = createRuntimeEventCollector() -const result = await runAgentTask({ task, adapter, onEvent: telemetry.onEvent }) +| Error | When | +|---|---| +| `ValidationError` | Caller passed invalid arguments | +| `ConfigError` | Required env / config missing | +| `NotFoundError` | A named resource does not exist | +| `BackendTransportError` | Backend HTTP / IPC call returned non-success | +| `SessionMismatchError` | Resume requested against a different backend | +| `RuntimeRunStateError` | `RuntimeRunHandle` lifecycle methods called out of order | -console.log(telemetry.events) -console.log(summarizeAgentTaskRun(result)) -``` +All extend `AgentEvalError` (re-exported from `@tangle-network/agent-eval`) +and carry a stable `code` so cross-package handlers can pattern-match +without importing the runtime. -By default, the collector redacts task inputs, user answers, credential -questions, control payloads, evidence IDs, task metadata, and eval -details. Private diagnostics opt-in via `RuntimeTelemetryOptions` flags -(`includeInputs`, `includeUserAnswers`, `includeControlPayloads`, -`includeEvidenceIds`, `includeRequirementDescriptions`, -`includeMetadata`, `includeEvalDetails`). +## Sanitized telemetry -For `runAgentTaskStream`, use the sibling -`createRuntimeStreamEventCollector`: +`task.intent` flows through sanitized telemetry on every event. **Never +set it to user input** — use a fixed string describing the operation +kind (e.g. `"Run a chat turn"`, `"Score a tax return"`). Route user- +visible content through `task.inputs` (redacted by default). 
```ts -import { - createRuntimeStreamEventCollector, - runAgentTaskStream, -} from '@tangle-network/agent-runtime' +import { createRuntimeStreamEventCollector, runAgentTaskStream } from '@tangle-network/agent-runtime' const telemetry = createRuntimeStreamEventCollector() for await (const event of runAgentTaskStream({ task, backend })) { telemetry.onEvent(event) } - -console.log(telemetry.events) -console.log(telemetry.summary()) +console.log(telemetry.events, telemetry.summary()) ``` -Same `RuntimeTelemetryOptions` flags apply. Streaming and non-streaming -events have different field shapes (timestamps, sessions, text/tool -deltas), which is why the factories are siblings rather than overloads — -a single dispatcher would silently misroute events whose `type` literals -overlap (`task_start`, `readiness_end`, etc.). - -### `task.intent` is sanitized telemetry by default - -`task.intent` flows through sanitized telemetry on every event. **Never -set it to user input** — use a fixed string describing the operation -kind (e.g. `"Run a chat turn"`, `"Score a tax return"`). If you need to -log user-visible intent, route it through `inputs` (which are redacted -by default) instead. - -For SSE-over-HTTP, use the helpers: - -```ts -import { readinessServerSentEvent } from '@tangle-network/agent-runtime' -writer.write(encoder.encode(readinessServerSentEvent(readinessReport))) -``` +By default the collector redacts task inputs, user answers, credential +questions, control payloads, evidence IDs, task metadata, and eval +details. Private diagnostics opt-in via `RuntimeTelemetryOptions`. 
## Package boundaries | Package | Owns | |---|---| -| `agent-runtime` | Reusable lifecycle and adapter contracts | -| `agent-eval` | Control loops, readiness scoring, traces, evals, failure classes, optimization, release evidence | +| `agent-runtime` | Lifecycle, adapters, backends, `RuntimeRunHandle`, trace bridge | +| `agent-eval` | Control loops, readiness scoring, traces, evals, failure classes, release evidence | | `agent-knowledge` | Evidence, claims, wiki pages, retrieval, knowledge bundle builders | | Domain packages | Domain tools, policies, credentials, UI text, rubrics | The API uses `runAgentTask`, not `runVerticalAgentTask`. `domain` is -metadata on the task, because the runtime should be reusable across many -kinds of agents without baking taxonomy into type names. +metadata on the task because the runtime is reusable across many kinds of +agents without baking taxonomy into type names. ## Examples Runnable in [`examples/`](./examples/): -- [`basic-task/`](./examples/basic-task/) — the smallest `runAgentTask` -- [`with-knowledge-readiness/`](./examples/with-knowledge-readiness/) — readiness gating + custom `onKnowledgeBlocked` -- [`sanitized-telemetry/`](./examples/sanitized-telemetry/) — `createRuntimeEventCollector` + redaction policy -- [`sanitized-telemetry-streaming/`](./examples/sanitized-telemetry-streaming/) — `createRuntimeStreamEventCollector` + redaction policy for `runAgentTaskStream` +- [`basic-task/`](./examples/basic-task/) — smallest `runAgentTask` +- [`with-knowledge-readiness/`](./examples/with-knowledge-readiness/) — readiness gating + `onKnowledgeBlocked` +- [`sanitized-telemetry/`](./examples/sanitized-telemetry/) — `createRuntimeEventCollector` + redaction +- [`sanitized-telemetry-streaming/`](./examples/sanitized-telemetry-streaming/) — streaming collector + redaction - [`sse-stream/`](./examples/sse-stream/) — Server-Sent Events for browser clients -- [`sandbox-stream-backend/`](./examples/sandbox-stream-backend/) — 
`runAgentTaskStream` with `createSandboxPromptBackend` (synthetic sandbox client; real one in `agent-builder`) -- [`openai-stream-backend/`](./examples/openai-stream-backend/) — `runAgentTaskStream` with `createOpenAICompatibleBackend` +- [`sandbox-stream-backend/`](./examples/sandbox-stream-backend/) — `createSandboxPromptBackend` +- [`openai-stream-backend/`](./examples/openai-stream-backend/) — `createOpenAICompatibleBackend` +- [`runtime-run/`](./examples/runtime-run/) — `startRuntimeRun` + cost ledger + persistence adapter (NEW) diff --git a/biome.json b/biome.json new file mode 100644 index 0000000..543a0f8 --- /dev/null +++ b/biome.json @@ -0,0 +1,58 @@ +{ + "$schema": "https://biomejs.dev/schemas/2.4.15/schema.json", + "files": { + "includes": ["src/**", "tests/**", "examples/**/*.ts", "examples/**/*.tsx"], + "ignoreUnknown": true + }, + "formatter": { + "enabled": true, + "indentStyle": "space", + "indentWidth": 2, + "lineWidth": 100, + "lineEnding": "lf" + }, + "javascript": { + "formatter": { + "quoteStyle": "single", + "semicolons": "asNeeded", + "trailingCommas": "all", + "arrowParentheses": "always" + } + }, + "linter": { + "enabled": true, + "rules": { + "recommended": true, + "suspicious": { + "noExplicitAny": "off", + "noConsole": "off", + "noAssignInExpressions": "warn", + "noImplicitAnyLet": "warn" + }, + "style": { + "useImportType": "warn", + "useExportType": "warn", + "useNodejsImportProtocol": "error", + "noNonNullAssertion": "off", + "useTemplate": "warn", + "useExponentiationOperator": "warn", + "useShorthandFunctionType": "warn" + }, + "complexity": { + "noUselessTypeConstraint": "warn", + "noBannedTypes": "warn" + }, + "correctness": { + "noUnusedVariables": "off", + "noUnusedImports": "warn" + } + } + }, + "assist": { + "actions": { + "source": { + "organizeImports": "on" + } + } + } +} diff --git a/examples/basic-task/basic-task.ts b/examples/basic-task/basic-task.ts index 7c72c59..722efad 100644 --- a/examples/basic-task/basic-task.ts 
+++ b/examples/basic-task/basic-task.ts @@ -5,10 +5,12 @@ * pnpm tsx examples/basic-task/basic-task.ts */ -import { runAgentTask } from '@tangle-network/agent-runtime' import type { AgentAdapter } from '@tangle-network/agent-runtime' +import { runAgentTask } from '@tangle-network/agent-runtime' -interface TaxState { reviewCount: number } +interface TaxState { + reviewCount: number +} type TaxAction = { kind: 'review' } let reviews = 0 diff --git a/examples/openai-stream-backend/openai-stream-backend.ts b/examples/openai-stream-backend/openai-stream-backend.ts index 669c015..25c97a5 100644 --- a/examples/openai-stream-backend/openai-stream-backend.ts +++ b/examples/openai-stream-backend/openai-stream-backend.ts @@ -7,8 +7,8 @@ */ import { - InMemoryRuntimeSessionStore, createOpenAICompatibleBackend, + InMemoryRuntimeSessionStore, runAgentTaskStream, runtimeStreamServerSentEvent, } from '@tangle-network/agent-runtime' diff --git a/examples/runtime-run/README.md b/examples/runtime-run/README.md new file mode 100644 index 0000000..a064f2a --- /dev/null +++ b/examples/runtime-run/README.md @@ -0,0 +1,29 @@ +# runtime-run + +Canonical `RuntimeRunHandle` lifecycle: drive a streaming task through +`runAgentTaskStream`, observe `llm_call` events into a cost ledger, and +persist a `RuntimeRunRow` to your durable store. + +Use as the replacement for bespoke `agentRuns`-row plumbing (legal-agent's +`completeProductionAgentRun` + `persistRuntimeRun` pair is the canonical +example of what this pattern subsumes). + +## Run + +```bash +pnpm tsx examples/runtime-run/runtime-run.ts +``` + +## What it shows + +- `startRuntimeRun({ workspaceId, sessionId, taskSpec, adapter })` to open a run +- `handle.observe(event)` per yielded `RuntimeStreamEvent` to keep the cost + ledger in sync (only `llm_call` events contribute; everything else is a + no-op so you can pipe the whole stream through `observe`) +- `handle.complete({ status, resultSummary, error? 
})` exactly once at end-of- + stream (idempotent for the same status, throws for status transitions) +- `handle.persist()` to write a `RuntimeRunRow` via your + `RuntimeRunPersistenceAdapter` (D1, postgres, KV — anything with an + `upsert(row)`) +- `handle.cost()` returns the accumulated `{ tokensIn, tokensOut, costUsd, + wallMs, llmCalls }` for cost dashboards diff --git a/examples/runtime-run/runtime-run.ts b/examples/runtime-run/runtime-run.ts new file mode 100644 index 0000000..5c57ae0 --- /dev/null +++ b/examples/runtime-run/runtime-run.ts @@ -0,0 +1,121 @@ +/** + * Production-run lifecycle: drive a streaming task through `runAgentTaskStream` + * AND record a canonical `RuntimeRunRow` for cost/audit dashboards. + * + * This is the pattern that replaces legal-agent's bespoke + * `completeProductionAgentRun` + `persistRuntimeRun` pair. Wire it into your + * own DB by implementing the `RuntimeRunPersistenceAdapter` interface (one + * `upsert(row)` method). + * + * Run with: + * pnpm tsx examples/runtime-run/runtime-run.ts + */ + +import { + type AgentBackendInput, + type AgentTaskSpec, + createIterableBackend, + type RuntimeRunPersistenceAdapter, + type RuntimeRunRow, + runAgentTaskStream, + startRuntimeRun, +} from '@tangle-network/agent-runtime' + +const readyTask: AgentTaskSpec = { + id: 'legal-chat:thread-42', + intent: 'Run a legal advisory chat turn with workspace context.', + domain: 'legal', + metadata: { workspaceId: 'ws-1', threadId: 'thread-42' }, +} + +// Toy backend that yields a couple of llm_call events so the cost ledger has +// real input. Real consumers plug in `createOpenAICompatibleBackend`, +// `createSandboxPromptBackend`, or any `AgentExecutionBackend`. 
+const backend = createIterableBackend({ + kind: 'demo', + async *stream(_input, ctx) { + yield { + type: 'llm_call', + task: ctx.task, + session: ctx.session, + model: 'claude-sonnet-4-6', + tokensIn: 1_200, + tokensOut: 280, + costUsd: 0.0042, + latencyMs: 510, + timestamp: new Date().toISOString(), + } + yield { + type: 'text_delta', + task: ctx.task, + session: ctx.session, + text: 'Reviewed the matter. No blocking issues found.', + timestamp: new Date().toISOString(), + } + yield { + type: 'llm_call', + task: ctx.task, + session: ctx.session, + model: 'claude-sonnet-4-6', + tokensIn: 600, + tokensOut: 110, + costUsd: 0.0019, + latencyMs: 220, + timestamp: new Date().toISOString(), + } + }, +}) + +// In-memory adapter for demonstration. Real adapters write to D1 / postgres / +// the agent's `agentRuns` table — same `upsert(row)` shape. +const persisted: RuntimeRunRow[] = [] +const adapter: RuntimeRunPersistenceAdapter = { + upsert(row) { + persisted.push(row) + }, +} + +async function main(): Promise<void> { + const run = startRuntimeRun({ + workspaceId: 'ws-1', + sessionId: 'thread-42', + agentId: 'legal-chat-runtime', + taskSpec: readyTask, + scenarioId: 'legal-chat:thread-42', + adapter, + }) + + try { + for await (const event of runAgentTaskStream({ + task: readyTask, + backend, + input: { message: 'Please review the latest filing.' }, + })) { + run.observe(event) + if (event.type === 'final') { + const status = event.status === 'completed' ? 'completed' : 'failed' + run.complete({ + status, + resultSummary: status === 'completed' ? 'Reviewed' : 'Stream did not complete cleanly', + error: status === 'failed' ? event.reason : undefined, + }) + } + } + } catch (err) { + run.complete({ + status: 'failed', + resultSummary: 'Stream threw before final event', + error: err instanceof Error ? 
err.message : String(err), + }) + } + + await run.persist({ note: 'demo persistence metadata' }) + + console.log('Cost ledger:', run.cost()) + console.log('Persisted row:', persisted[0]) +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/examples/sandbox-stream-backend/sandbox-stream-backend.ts b/examples/sandbox-stream-backend/sandbox-stream-backend.ts index be06c54..b830a10 100644 --- a/examples/sandbox-stream-backend/sandbox-stream-backend.ts +++ b/examples/sandbox-stream-backend/sandbox-stream-backend.ts @@ -9,8 +9,8 @@ */ import { - InMemoryRuntimeSessionStore, createSandboxPromptBackend, + InMemoryRuntimeSessionStore, runAgentTaskStream, runtimeStreamServerSentEvent, } from '@tangle-network/agent-runtime' @@ -33,7 +33,7 @@ const sandboxClient = { get(id: string): SandboxBox { return { id, - async * streamPrompt(message: string) { + async *streamPrompt(message: string) { // A real sandbox forwards the prompt to a model + tools and // yields streamed tokens. Here we just yield three fragments. yield { type: 'text_delta' as const, text: `received: ${message}\n` } diff --git a/examples/sanitized-telemetry-streaming/sanitized-telemetry-streaming.ts b/examples/sanitized-telemetry-streaming/sanitized-telemetry-streaming.ts index ec6c209..53f5379 100644 --- a/examples/sanitized-telemetry-streaming/sanitized-telemetry-streaming.ts +++ b/examples/sanitized-telemetry-streaming/sanitized-telemetry-streaming.ts @@ -19,11 +19,11 @@ */ import { + type AgentBackendInput, createIterableBackend, createRuntimeStreamEventCollector, - runAgentTaskStream, - type AgentBackendInput, type RuntimeStreamEvent, + runAgentTaskStream, } from '@tangle-network/agent-runtime' // A synthetic backend that yields a small streaming script. In a real @@ -31,7 +31,7 @@ import { // CLI bridge — the redaction story is identical. 
const backend = createIterableBackend({ kind: 'demo-stream', - async * stream(_input, ctx) { + async *stream(_input, ctx) { yield { type: 'text_delta', task: ctx.task, diff --git a/examples/sanitized-telemetry/sanitized-telemetry.ts b/examples/sanitized-telemetry/sanitized-telemetry.ts index 2e56f6c..c1bfd57 100644 --- a/examples/sanitized-telemetry/sanitized-telemetry.ts +++ b/examples/sanitized-telemetry/sanitized-telemetry.ts @@ -5,18 +5,26 @@ * pnpm tsx examples/sanitized-telemetry/sanitized-telemetry.ts */ +import type { AgentAdapter } from '@tangle-network/agent-runtime' import { createRuntimeEventCollector, runAgentTask, summarizeAgentTaskRun, } from '@tangle-network/agent-runtime' -import type { AgentAdapter } from '@tangle-network/agent-runtime' const adapter: AgentAdapter<{ ready: boolean }, void, void> = { - async observe() { return { ready: true } }, - async validate() { return [{ id: 'ok', score: 1, passed: true }] }, - async decide() { return { kind: 'finish', reason: 'demo done' } }, - async act() { return undefined }, + async observe() { + return { ready: true } + }, + async validate() { + return [{ id: 'ok', score: 1, passed: true }] + }, + async decide() { + return { kind: 'finish', reason: 'demo done' } + }, + async act() { + return undefined + }, } async function main() { diff --git a/examples/sse-stream/sse-stream.ts b/examples/sse-stream/sse-stream.ts index ff24c51..8f27fbf 100644 --- a/examples/sse-stream/sse-stream.ts +++ b/examples/sse-stream/sse-stream.ts @@ -5,17 +5,14 @@ * pnpm tsx examples/sse-stream/sse-stream.ts */ +import { type KnowledgeRequirement, scoreKnowledgeReadiness } from '@tangle-network/agent-eval' import { - InMemoryRuntimeSessionStore, createIterableBackend, + InMemoryRuntimeSessionStore, readinessServerSentEvent, runAgentTaskStream, runtimeStreamServerSentEvent, } from '@tangle-network/agent-runtime' -import { - scoreKnowledgeReadiness, - type KnowledgeRequirement, -} from '@tangle-network/agent-eval' // ── 1. 
One-off readiness SSE — the kind of event you'd write to a // response stream when a task is gated by missing knowledge. @@ -49,7 +46,7 @@ process.stdout.write(readinessServerSentEvent(readinessReport)) // they map common shapes for you. const backend = createIterableBackend({ kind: 'demo-iterable', - async * stream(input) { + async *stream(input) { const message = input.message ?? '(no message)' yield { type: 'text_delta' as const, text: `you said: ${message}\n` } yield { type: 'text_delta' as const, text: 'thinking...\n' } diff --git a/examples/with-knowledge-readiness/with-knowledge-readiness.ts b/examples/with-knowledge-readiness/with-knowledge-readiness.ts index cdc1b1f..7414ccf 100644 --- a/examples/with-knowledge-readiness/with-knowledge-readiness.ts +++ b/examples/with-knowledge-readiness/with-knowledge-readiness.ts @@ -6,9 +6,9 @@ * pnpm tsx examples/with-knowledge-readiness/with-knowledge-readiness.ts */ -import { runAgentTask } from '@tangle-network/agent-runtime' -import type { AgentAdapter } from '@tangle-network/agent-runtime' import type { KnowledgeRequirement } from '@tangle-network/agent-eval' +import type { AgentAdapter } from '@tangle-network/agent-runtime' +import { runAgentTask } from '@tangle-network/agent-runtime' function requirement(currentConfidence: number): KnowledgeRequirement { return { @@ -28,10 +28,18 @@ function requirement(currentConfidence: number): KnowledgeRequirement { } const adapter: AgentAdapter<{ ready: boolean }, void, void> = { - observe() { return { ready: true } }, - validate() { return [] }, - decide() { return { kind: 'finish', reason: 'demo done' } }, - act() { return undefined }, + observe() { + return { ready: true } + }, + validate() { + return [] + }, + decide() { + return { kind: 'finish', reason: 'demo done' } + }, + act() { + return undefined + }, } async function main() { @@ -48,7 +56,10 @@ async function main() { console.log('blocked status:', blocked.status) console.log(' readinessScore:', 
blocked.knowledge.readinessScore) console.log(' recommendedAction:', blocked.knowledge.recommendedAction) - console.log(' blocking gaps:', blocked.knowledge.blockingMissingRequirements.map((r) => r.id)) + console.log( + ' blocking gaps:', + blocked.knowledge.blockingMissingRequirements.map((r) => r.id), + ) // Run 2: full confidence → readiness passes → control loop runs. const ready = await runAgentTask({ diff --git a/package.json b/package.json index d7af3d4..d474e31 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.6.0", + "version": "0.7.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { @@ -34,12 +34,15 @@ "prepare": "tsup", "test": "vitest run", "test:watch": "vitest", + "lint": "biome check src tests examples", + "lint:fix": "biome check --write src tests examples", "typecheck": "tsc --noEmit" }, "dependencies": { - "@tangle-network/agent-eval": "^0.23.0" + "@tangle-network/agent-eval": "^0.24.0" }, "devDependencies": { + "@biomejs/biome": "^2.4.0", "@types/node": "^25.6.0", "tsup": "^8.0.0", "typescript": "^5.7.0", @@ -49,5 +52,10 @@ "node": ">=20" }, "license": "MIT", - "packageManager": "pnpm@10.28.0" + "packageManager": "pnpm@10.28.0", + "pnpm": { + "minimumReleaseAge": 4320, + "minimumReleaseAgeExclude": ["@tangle-network/agent-eval"], + "onlyBuiltDependencies": ["esbuild"] + } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e41c79d..f46b92a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,9 +9,12 @@ importers: .: dependencies: '@tangle-network/agent-eval': - specifier: ^0.23.0 - version: 0.23.0(typescript@5.9.3) + specifier: ^0.24.0 + version: 0.24.0(typescript@5.9.3) devDependencies: + '@biomejs/biome': + specifier: ^2.4.0 + version: 2.4.15 '@types/node': specifier: ^25.6.0 version: 25.6.0 @@ -44,6 +47,59 @@ packages: zod: optional: true + 
'@biomejs/biome@2.4.15': + resolution: {integrity: sha512-j5VH3a/h/HXTKBM50MDMxRCzkeLv9S2XJcW2WgnZT1+xyisi+0bISrXR82gCX+8S9lvK0skEvHJRN+3Ktr2hlw==} + engines: {node: '>=14.21.3'} + hasBin: true + + '@biomejs/cli-darwin-arm64@2.4.15': + resolution: {integrity: sha512-rF3PPqLq1yoST79zaQbDjVJwsuIeci/O+9bgNmC5QpgOqz6aqYuzA4abyAGx+mgyiDXn4A049xAN8gijbuR1Qg==} + engines: {node: '>=14.21.3'} + cpu: [arm64] + os: [darwin] + + '@biomejs/cli-darwin-x64@2.4.15': + resolution: {integrity: sha512-/5KHXYMfSJs1fNXiX30xFtI8JcCFV6zaVVLxOa0M2sfqBKHkpQhRTv94yxQWxeTY2lzo2OuTlNvPC+hDQt2wcQ==} + engines: {node: '>=14.21.3'} + cpu: [x64] + os: [darwin] + + '@biomejs/cli-linux-arm64-musl@2.4.15': + resolution: {integrity: sha512-ZPcxznxm0pogHBLZhYntyR3sR+MrZjqJIKEr7ZqVen0Rl+P/4upVmfYXjftizi9RoqZntg33fv/1fbdhbYXpEQ==} + engines: {node: '>=14.21.3'} + cpu: [arm64] + os: [linux] + + '@biomejs/cli-linux-arm64@2.4.15': + resolution: {integrity: sha512-owaAMZD/T4LrD0ELNCk0Km3qrRHuM0X6EAyVE1FSqGY0rbLoiDLrO4Us2tllm6cAeB2Ioa9C2C08NZPdr8+0Ug==} + engines: {node: '>=14.21.3'} + cpu: [arm64] + os: [linux] + + '@biomejs/cli-linux-x64-musl@2.4.15': + resolution: {integrity: sha512-CNq/9W38SYSH023lfcQ4KKU8K0YX8T//FZUhcgtMMRABDojx5XsMV7jlweAvGSl389wJQB29Qo6Zb/a+jdvt+w==} + engines: {node: '>=14.21.3'} + cpu: [x64] + os: [linux] + + '@biomejs/cli-linux-x64@2.4.15': + resolution: {integrity: sha512-0jj7THz12GbUOLmMibktK6DZjqz2zV64KFxyBtcFTKPiiOIY0a7vns1elpO1dERvxpsZ5ik0oFfz0oGwFde1+g==} + engines: {node: '>=14.21.3'} + cpu: [x64] + os: [linux] + + '@biomejs/cli-win32-arm64@2.4.15': + resolution: {integrity: sha512-ouhkYdlhp/1GghEJPdWwD/Vi3gQ1nFxuSpMolWsbq3Lsq3QUR4jl6UdhhscdCugKU5vOEuMiJhvKj66O0OCq+w==} + engines: {node: '>=14.21.3'} + cpu: [arm64] + os: [win32] + + '@biomejs/cli-win32-x64@2.4.15': + resolution: {integrity: sha512-zBrGq5mx5wwpnow4+2BxUvleDM+GNd4sLbPaMapsSLQLD0NGRCquqPBTgN+7XkUteHvj7M+BstuI8tmnV7+HgQ==} + engines: {node: '>=14.21.3'} + cpu: [x64] + os: [win32] + '@esbuild/aix-ppc64@0.27.7': 
resolution: {integrity: sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg==} engines: {node: '>=18'} @@ -386,8 +442,8 @@ packages: '@scure/bip39@2.2.0': resolution: {integrity: sha512-T/Bj/YvYMNkIPq6EENO6/rcs2e7qTNuyoUXf0KBFDmp0ZDu0H2X4Lq6yC3i0c8PcWkov5EbW+yQZZbdMmk154A==} - '@tangle-network/agent-eval@0.23.0': - resolution: {integrity: sha512-YY4J2v1epvTBJ3HeNAYs4AaeurgUZCTfmooGrmDbKeAfWSD6Xzv8RC33xChd1Tge/IGDz1ILRTfpLqyuhNU2aQ==} + '@tangle-network/agent-eval@0.24.0': + resolution: {integrity: sha512-Wwr0qIwI/m/HsNTotyZnfM+aLbcbgpAyGhqjgyO3YqJQaZckJx/DUH0AqppVIfOAoR2vZbdvTRaQNeltDidSrA==} engines: {node: '>=20'} hasBin: true @@ -891,6 +947,41 @@ snapshots: optionalDependencies: zod: 4.4.2 + '@biomejs/biome@2.4.15': + optionalDependencies: + '@biomejs/cli-darwin-arm64': 2.4.15 + '@biomejs/cli-darwin-x64': 2.4.15 + '@biomejs/cli-linux-arm64': 2.4.15 + '@biomejs/cli-linux-arm64-musl': 2.4.15 + '@biomejs/cli-linux-x64': 2.4.15 + '@biomejs/cli-linux-x64-musl': 2.4.15 + '@biomejs/cli-win32-arm64': 2.4.15 + '@biomejs/cli-win32-x64': 2.4.15 + + '@biomejs/cli-darwin-arm64@2.4.15': + optional: true + + '@biomejs/cli-darwin-x64@2.4.15': + optional: true + + '@biomejs/cli-linux-arm64-musl@2.4.15': + optional: true + + '@biomejs/cli-linux-arm64@2.4.15': + optional: true + + '@biomejs/cli-linux-x64-musl@2.4.15': + optional: true + + '@biomejs/cli-linux-x64@2.4.15': + optional: true + + '@biomejs/cli-win32-arm64@2.4.15': + optional: true + + '@biomejs/cli-win32-x64@2.4.15': + optional: true + '@esbuild/aix-ppc64@0.27.7': optional: true @@ -1104,7 +1195,7 @@ snapshots: '@noble/hashes': 2.2.0 '@scure/base': 2.2.0 - '@tangle-network/agent-eval@0.23.0(typescript@5.9.3)': + '@tangle-network/agent-eval@0.24.0(typescript@5.9.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2) '@ax-llm/ax': 19.0.45(zod@4.4.2) diff --git a/src/backends.ts b/src/backends.ts new file mode 100644 index 0000000..c1a0d0d --- /dev/null +++ 
b/src/backends.ts @@ -0,0 +1,310 @@ +/** + * @stable + * + * Backend factories for `runAgentTaskStream`. Three shapes ship in core: + * + * - `createIterableBackend` — wrap any custom async iterable into a backend + * - `createSandboxPromptBackend` — sandbox / sidecar `streamPrompt` clients + * - `createOpenAICompatibleBackend` — OpenAI-style chat completions endpoints + * + * Adapters stay thin: domain repos own auth, model selection, and the concrete + * tool surface. The factories handle session creation, stream normalization, + * and graceful end-of-stream signalling. + */ + +import { BackendTransportError } from './errors' +import { newRuntimeSession, nowIso, touchSession } from './sessions' +import type { + AgentBackendContext, + AgentBackendInput, + AgentExecutionBackend, + RuntimeSession, + RuntimeStreamEvent, +} from './types' + +/** @stable */ +export function createIterableBackend(options: { + kind: string + start?: AgentExecutionBackend['start'] + resume?: AgentExecutionBackend['resume'] + stream: AgentExecutionBackend['stream'] + stop?: AgentExecutionBackend['stop'] +}): AgentExecutionBackend { + return options +} + +/** @stable */ +export function createSandboxPromptBackend< + TBox, + TInput extends AgentBackendInput = AgentBackendInput, +>(options: { + kind?: string + getBox(input: TInput, context: Omit): Promise | TBox + streamPrompt(box: TBox, message: string, context: AgentBackendContext): AsyncIterable + mapEvent?: (event: unknown, context: AgentBackendContext) => RuntimeStreamEvent | undefined + getSessionId?: (box: TBox, input: TInput) => string | undefined +}): AgentExecutionBackend { + const kind = options.kind ?? 'sandbox' + return { + kind, + async start(input, context) { + const box = await options.getBox(input, context) + return newRuntimeSession( + kind, + options.getSessionId?.(box, input) ?? 
context.requestedSessionId, + { resumable: true }, + ) + }, + resume(session) { + return touchSession({ ...session, status: 'active' }) + }, + async *stream(input, context) { + const box = await options.getBox(input, context) + const message = input.message ?? input.messages?.at(-1)?.content ?? context.task.intent + for await (const event of options.streamPrompt(box, message, context)) { + const mapped = options.mapEvent?.(event, context) ?? mapCommonBackendEvent(event, context) + if (mapped) yield mapped + } + }, + } +} + +/** @stable */ +export function createOpenAICompatibleBackend< + TInput extends AgentBackendInput = AgentBackendInput, +>(options: { + apiKey: string + baseUrl: string + model: string + kind?: string + fetchImpl?: typeof fetch +}): AgentExecutionBackend { + const fetcher = options.fetchImpl ?? fetch + const kind = options.kind ?? 'tcloud' + return { + kind, + start(_input, context) { + return newRuntimeSession(kind, context.requestedSessionId) + }, + async *stream(input, context) { + const response = await fetcher(`${options.baseUrl.replace(/\/$/, '')}/chat/completions`, { + method: 'POST', + headers: { + Authorization: `Bearer ${options.apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: options.model, + stream: true, + messages: input.messages ?? [ + { role: 'user', content: input.message ?? 
context.task.intent }, + ], + }), + signal: context.signal, + }) + if (!response.ok) { + throw new BackendTransportError(kind, `chat backend returned ${response.status}`, { + status: response.status, + }) + } + yield* streamResponseEvents(response, context) + }, + } +} + +/** @internal */ +export function normalizeBackendStreamEvent( + event: RuntimeStreamEvent, + task: AgentBackendContext['task'], + session: RuntimeSession, +): RuntimeStreamEvent { + if ( + 'task' in event && + event.task && + 'session' in event && + event.session && + 'timestamp' in event && + event.timestamp + ) { + return event + } + return { + ...event, + task: 'task' in event && event.task ? event.task : task, + session: 'session' in event && event.session ? event.session : session, + timestamp: 'timestamp' in event && event.timestamp ? event.timestamp : nowIso(), + } as RuntimeStreamEvent +} + +function mapCommonBackendEvent( + event: unknown, + context: AgentBackendContext, +): RuntimeStreamEvent | undefined { + if (!event || typeof event !== 'object') return undefined + const record = event as Record + const type = String(record.type ?? '') + const data = + record.data && typeof record.data === 'object' + ? (record.data as Record) + : record + if (type === 'message.part.updated' || type === 'text_delta' || type === 'delta') { + const text = stringValue(data.text) ?? stringValue(data.delta) ?? stringValue(record.text) + return text + ? { + type: 'text_delta', + task: context.task, + session: context.session, + text, + timestamp: nowIso(), + } + : undefined + } + if (type === 'reasoning_delta') { + const text = stringValue(data.text) ?? stringValue(record.text) + return text + ? { + type: 'reasoning_delta', + task: context.task, + session: context.session, + text, + timestamp: nowIso(), + } + : undefined + } + if (type === 'tool_call') { + return { + type: 'tool_call', + task: context.task, + session: context.session, + toolName: stringValue(data.name) ?? stringValue(record.toolName) ?? 
'tool', + toolCallId: stringValue(data.id) ?? stringValue(record.toolCallId), + args: data.args ?? data.input ?? record.args, + timestamp: nowIso(), + } + } + if (type === 'tool_result') { + return { + type: 'tool_result', + task: context.task, + session: context.session, + toolName: stringValue(data.name) ?? stringValue(record.toolName) ?? 'tool', + toolCallId: stringValue(data.id) ?? stringValue(record.toolCallId), + result: data.result ?? data.output ?? record.result, + timestamp: nowIso(), + } + } + if (type === 'result' || type === 'final') { + const text = stringValue(data.finalText) ?? stringValue(data.text) ?? stringValue(record.text) + return text + ? { + type: 'text_delta', + task: context.task, + session: context.session, + text, + timestamp: nowIso(), + } + : undefined + } + return undefined +} + +async function* streamResponseEvents( + response: Response, + context: AgentBackendContext, +): AsyncIterable { + const body = response.body + if (!body) return + const reader = body.getReader() + const decoder = new TextDecoder() + let buffer = '' + for (;;) { + const { done, value } = await reader.read() + if (done) break + buffer += decoder.decode(value, { stream: true }).replace(/\r\n/g, '\n') + for (const event of drainStreamBuffer(false)) yield event + } + buffer += decoder.decode().replace(/\r\n/g, '\n') + for (const event of drainStreamBuffer(true)) yield event + if (buffer.trim()) { + const event = parseStreamChunk(buffer, context) + if (event) yield event + } + + function* drainStreamBuffer(flush: boolean): Iterable { + for (;;) { + const sseBoundary = buffer.indexOf('\n\n') + if (sseBoundary >= 0) { + const chunk = buffer.slice(0, sseBoundary) + buffer = buffer.slice(sseBoundary + 2) + const event = parseStreamChunk(chunk, context) + if (event) yield event + continue + } + + const newline = buffer.indexOf('\n') + if (newline >= 0 && !buffer.slice(0, newline).startsWith('data:')) { + const line = buffer.slice(0, newline) + buffer = 
buffer.slice(newline + 1) + const event = parseStreamChunk(line, context) + if (event) yield event + continue + } + + if (flush && buffer.trim() && !buffer.trimStart().startsWith('data:')) { + const line = buffer + buffer = '' + const event = parseStreamChunk(line, context) + if (event) yield event + continue + } + + break + } + } +} + +function parseStreamChunk( + chunk: string, + context: AgentBackendContext, +): RuntimeStreamEvent | undefined { + const lines = chunk.split(/\r?\n/) + const dataLines = lines.filter((line) => line.startsWith('data:')) + const data = + dataLines.length > 0 + ? dataLines.map((line) => line.slice(5).trimStart()).join('\n') + : chunk.trim() + if (!data || data === '[DONE]') return undefined + try { + const parsed = JSON.parse(data) as Record + const choices = parsed.choices + const choice = Array.isArray(choices) + ? (choices[0] as Record | undefined) + : undefined + const delta = choice?.delta as Record | undefined + const message = choice?.message as Record | undefined + const text = + stringValue(delta?.content) ?? stringValue(message?.content) ?? stringValue(parsed.text) + if (text) { + return { + type: 'text_delta', + task: context.task, + session: context.session, + text, + timestamp: nowIso(), + } + } + return mapCommonBackendEvent(parsed, context) + } catch { + return { + type: 'text_delta', + task: context.task, + session: context.session, + text: data, + timestamp: nowIso(), + } + } +} + +function stringValue(value: unknown): string | undefined { + return typeof value === 'string' && value.length > 0 ? value : undefined +} diff --git a/src/errors.ts b/src/errors.ts new file mode 100644 index 0000000..3071021 --- /dev/null +++ b/src/errors.ts @@ -0,0 +1,89 @@ +/** + * @stable + * + * Error taxonomy for `@tangle-network/agent-runtime`. 
+ * + * Public contract: every error this package throws as part of its consumer- + * facing API either extends `AgentEvalError` (re-exported here for ergonomic + * `instanceof` checks at the runtime boundary) or extends one of the + * runtime-specific subclasses below. + * + * Internal invariant guards (`throw new Error('this should never happen')`) + * remain plain `Error` — they are programmer-mistake assertions, not + * consumer-catchable contract failures. + * + * Subclassing strategy: where a runtime-specific failure maps cleanly to an + * agent-eval code (validation, config, not_found), we re-use the agent-eval + * subclass. Runtime-only failure modes (session resume against the wrong + * backend, backend transport errors) get fresh subclasses that still carry an + * `AgentEvalErrorCode` so cross-package handlers can pattern-match without + * importing the runtime. + */ + +import { AgentEvalError } from '@tangle-network/agent-eval' + +export { + AgentEvalError, + type AgentEvalErrorCode, + CaptureIntegrityError, + ConfigError, + JudgeError, + NotFoundError, + ReplayError, + ValidationError, + VerificationError, +} from '@tangle-network/agent-eval' + +/** + * @stable + * + * Caller asked to resume a session against a backend whose `kind` does not + * match the session's recorded backend. This is a routing bug — the same + * session id was reused across two different backend implementations — and + * is not retryable without picking the right backend. 
+ */ +export class SessionMismatchError extends AgentEvalError { + readonly sessionBackend: string + readonly requestedBackend: string + + constructor(sessionBackend: string, requestedBackend: string, options?: { cause?: unknown }) { + super( + 'validation', + `Cannot resume ${sessionBackend} session with ${requestedBackend} backend`, + options, + ) + this.sessionBackend = sessionBackend + this.requestedBackend = requestedBackend + } +} + +/** + * @stable + * + * A backend transport call (HTTP, gRPC, sidecar IPC) failed with a non-success + * status. Distinct from `JudgeError` (which is structural / unrecoverable) + * because backend failures are sometimes retryable and consumers may want to + * branch on the upstream status code. + */ +export class BackendTransportError extends AgentEvalError { + readonly backend: string + readonly status?: number + + constructor(backend: string, message: string, options?: { cause?: unknown; status?: number }) { + super('config', message, options) + this.backend = backend + this.status = options?.status + } +} + +/** + * @stable + * + * A runtime-run lifecycle method was called in an order the state machine does + * not allow: `persist()` before `complete()`, `complete()` twice, etc. 
+ */ +export class RuntimeRunStateError extends AgentEvalError { + constructor(message: string, options?: { cause?: unknown }) { + super('validation', message, options) + } +} diff --git a/src/index.ts b/src/index.ts index ca4ae87..65597eb 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,1379 +1,17 @@ -import { - acquisitionPlansForKnowledgeGaps, - blockingKnowledgeEval, - runAgentControlLoop, - scoreKnowledgeReadiness, - userQuestionsForKnowledgeGaps, - type ControlBudget, - type ControlContext, - type ControlDecision, - type ControlEvalResult, - type ControlRunResult, - type ControlStep, - type DataAcquisitionPlan, - type KnowledgeReadinessReport, - type KnowledgeRequirement, - type RunRecord, - type TraceStore, - type UserQuestion, -} from '@tangle-network/agent-eval' - -export interface AgentTaskSpec { - id: string - intent: string - /** Domain is metadata, not an architectural boundary: tax, legal, gtm, creative, blueprint, redteam, etc. */ - domain?: string - inputs?: Record - requiredKnowledge?: KnowledgeRequirement[] - budget?: Partial - metadata?: Record -} - -export interface AgentKnowledgeProvider { - buildReadiness?(task: AgentTaskSpec): Promise | KnowledgeReadinessReport - answerQuestions?(questions: UserQuestion[], task: AgentTaskSpec): Promise> | Record - executeAcquisitionPlans?(plans: DataAcquisitionPlan[], task: AgentTaskSpec): Promise | string[] - refreshReadiness?(input: { - task: AgentTaskSpec - previous: KnowledgeReadinessReport - userAnswers: Record - acquiredEvidenceIds: string[] - }): Promise | KnowledgeReadinessReport -} - -export interface AgentTaskContext { - task: AgentTaskSpec - knowledge: KnowledgeReadinessReport - state: TState - evals: TEval[] - history: ControlStep[] - budget: ControlBudget - stepIndex: number - wallMs: number - spentCostUsd: number - remainingCostUsd?: number - abortSignal: AbortSignal -} - -export interface AgentAdapter { - observe(ctx: { - task: AgentTaskSpec - knowledge: KnowledgeReadinessReport - history: 
ControlStep[] - abortSignal: AbortSignal - }): Promise | TState - - validate(ctx: { - task: AgentTaskSpec - knowledge: KnowledgeReadinessReport - state: TState - history: ControlStep[] - abortSignal: AbortSignal - }): Promise | TEval[] - - decide(ctx: AgentTaskContext): Promise> | ControlDecision - - act(action: TAction, ctx: AgentTaskContext): Promise | TActionResult - - shouldStop?(ctx: AgentTaskContext): Promise<{ - stop: boolean - pass: boolean - reason: string - score?: number - }> | { - stop: boolean - pass: boolean - reason: string - score?: number - } - - onKnowledgeBlocked?(ctx: { - task: AgentTaskSpec - knowledge: KnowledgeReadinessReport - questions: UserQuestion[] - acquisitionPlans: DataAcquisitionPlan[] - }): Promise> | ControlDecision - - getActionCostUsd?(ctx: { - action: TAction - result: TActionResult - task: AgentTaskSpec - state: TState - evals: TEval[] - history: ControlStep[] - }): number | undefined - - projectRunRecords?(result: ControlRunResult, task: AgentTaskSpec): RunRecord[] -} - -export type AgentTaskStatus = - | 'completed' - | 'blocked' - | 'failed' - | 'aborted' - -export type AgentRuntimeEvent = - | { type: 'task_start'; task: AgentTaskSpec } - | { type: 'readiness_start'; task: AgentTaskSpec } - | { type: 'readiness_end'; task: AgentTaskSpec; knowledge: KnowledgeReadinessReport } - | { type: 'questions_start'; task: AgentTaskSpec; questions: UserQuestion[] } - | { type: 'questions_end'; task: AgentTaskSpec; questions: UserQuestion[]; userAnswers: Record } - | { type: 'acquisition_start'; task: AgentTaskSpec; acquisitionPlans: DataAcquisitionPlan[] } - | { type: 'acquisition_end'; task: AgentTaskSpec; acquisitionPlans: DataAcquisitionPlan[]; acquiredEvidenceIds: string[] } - | { type: 'control_start'; task: AgentTaskSpec; knowledge: KnowledgeReadinessReport } - | { type: 'control_step'; task: AgentTaskSpec; step: ControlStep } - | { type: 'control_end'; task: AgentTaskSpec; control: ControlRunResult } - | { type: 'task_end'; task: 
AgentTaskSpec; status: AgentTaskStatus; reason: string } - -export type AgentRuntimeEventSink = ( - event: AgentRuntimeEvent, -) => Promise | void - -export type RuntimeStreamEvent = - | { type: 'task_start'; task: AgentTaskSpec; timestamp: string } - | { type: 'readiness_start'; task: AgentTaskSpec; timestamp: string } - | { type: 'readiness_end'; task: AgentTaskSpec; knowledge: KnowledgeReadinessReport; decision: KnowledgeReadinessDecision; timestamp: string } - | { type: 'questions_start'; task: AgentTaskSpec; questions: UserQuestion[]; timestamp: string } - | { type: 'questions_end'; task: AgentTaskSpec; questions: UserQuestion[]; userAnswers: Record; timestamp: string } - | { type: 'acquisition_start'; task: AgentTaskSpec; acquisitionPlans: DataAcquisitionPlan[]; timestamp: string } - | { type: 'acquisition_end'; task: AgentTaskSpec; acquisitionPlans: DataAcquisitionPlan[]; acquiredEvidenceIds: string[]; timestamp: string } - | { type: 'session_created'; task: AgentTaskSpec; session: RuntimeSession; timestamp: string } - | { type: 'session_resumed'; task: AgentTaskSpec; session: RuntimeSession; timestamp: string } - | { type: 'backend_start'; task: AgentTaskSpec; session: RuntimeSession; backend: string; timestamp: string } - | { type: 'text_delta'; task?: AgentTaskSpec; session?: RuntimeSession; text: string; timestamp?: string } - | { type: 'reasoning_delta'; task?: AgentTaskSpec; session?: RuntimeSession; text: string; timestamp?: string } - | { type: 'tool_call'; task?: AgentTaskSpec; session?: RuntimeSession; toolName: string; toolCallId?: string; args?: unknown; timestamp?: string } - | { type: 'tool_result'; task?: AgentTaskSpec; session?: RuntimeSession; toolName: string; toolCallId?: string; result?: unknown; timestamp?: string } - | { type: 'artifact'; task?: AgentTaskSpec; session?: RuntimeSession; artifactId: string; name?: string; mimeType?: string; uri?: string; metadata?: Record; timestamp?: string } - | { type: 'backend_error'; task: 
AgentTaskSpec; session?: RuntimeSession; backend: string; message: string; recoverable: boolean; timestamp: string } - | { type: 'backend_end'; task: AgentTaskSpec; session: RuntimeSession; backend: string; timestamp: string } - | { type: 'task_end'; task: AgentTaskSpec; status: AgentTaskStatus; reason: string; timestamp: string } - | { type: 'final'; task: AgentTaskSpec; session?: RuntimeSession; status: AgentTaskStatus; reason: string; text?: string; metadata?: Record; timestamp: string } - -export interface RuntimeSession { - id: string - backend: string - status: 'active' | 'completed' | 'failed' | 'aborted' - resumeToken?: string - createdAt: string - updatedAt: string - metadata?: Record -} - -export interface RuntimeSessionStore { - get(sessionId: string): Promise | RuntimeSession | undefined - put(session: RuntimeSession): Promise | void - appendEvent?(sessionId: string, event: RuntimeStreamEvent): Promise | void - listEvents?(sessionId: string): Promise | RuntimeStreamEvent[] -} - -export interface AgentBackendInput { - task: AgentTaskSpec - message?: string - messages?: Array<{ role: string; content: string }> - inputs?: Record -} - -export interface AgentBackendContext { - task: AgentTaskSpec - knowledge: KnowledgeReadinessReport - session: RuntimeSession - signal?: AbortSignal -} - -export interface AgentExecutionBackend { - kind: string - start?(input: TInput, context: Omit & { requestedSessionId?: string }): Promise | RuntimeSession - resume?(session: RuntimeSession, input: TInput, context: Omit): Promise | RuntimeSession - stream(input: TInput, context: AgentBackendContext): AsyncIterable - stop?(session: RuntimeSession, reason: string): Promise | void -} - -export interface RunAgentTaskStreamOptions { - task: AgentTaskSpec - backend: AgentExecutionBackend - input?: Omit - knowledge?: AgentKnowledgeProvider - sessionStore?: RuntimeSessionStore - sessionId?: string - resume?: boolean - signal?: AbortSignal - minimumReadinessScore?: number -} - -export 
interface RunAgentTaskOptions { - task: AgentTaskSpec - adapter: AgentAdapter - knowledge?: AgentKnowledgeProvider - onEvent?: AgentRuntimeEventSink - store?: TraceStore - signal?: AbortSignal - scenarioId?: string - projectId?: string - variantId?: string - minimumReadinessScore?: number -} - -export interface AgentTaskRunResult { - task: AgentTaskSpec - status: AgentTaskStatus - knowledge: KnowledgeReadinessReport - questions: UserQuestion[] - acquisitionPlans: DataAcquisitionPlan[] - userAnswers: Record - acquiredEvidenceIds: string[] - control: ControlRunResult - runRecords: RunRecord[] -} - -export interface RuntimeTelemetryOptions { - /** - * Include raw task inputs. Off by default because task inputs often - * contain customer facts, credentials, source text, or internal IDs. - */ - includeInputs?: boolean - /** Include requirement descriptions. Secret requirements are always redacted. */ - includeRequirementDescriptions?: boolean - /** Include evidence IDs. Off by default; counts are safer for shared reports. */ - includeEvidenceIds?: boolean - /** Include user answers from question preflight. Off by default. */ - includeUserAnswers?: boolean - /** Include action payloads and action results for control steps. Off by default. */ - includeControlPayloads?: boolean - /** Include task metadata. Off by default because metadata may carry IDs or policy internals. */ - includeMetadata?: boolean - /** Include eval detail/evidence strings. Off by default because validators may echo private input. 
*/ - includeEvalDetails?: boolean -} - -export interface SanitizedKnowledgeRequirement { - id: string - description?: string - requiredFor: string[] - category: KnowledgeRequirement['category'] - acquisitionMode: KnowledgeRequirement['acquisitionMode'] - importance: KnowledgeRequirement['importance'] - freshness: KnowledgeRequirement['freshness'] - sensitivity: KnowledgeRequirement['sensitivity'] - confidenceNeeded: number - currentConfidence: number - evidenceCount: number - evidenceIds?: string[] - fallbackPolicy: KnowledgeRequirement['fallbackPolicy'] -} - -export interface SanitizedKnowledgeReadinessReport { - taskId: string - readinessScore: number - recommendedAction: KnowledgeReadinessReport['recommendedAction'] - severity: KnowledgeReadinessReport['severity'] - reason: string - blockingMissingRequirements: SanitizedKnowledgeRequirement[] - nonBlockingGaps: SanitizedKnowledgeRequirement[] - evidenceCount: number - evidenceIds?: string[] - missingRequirementIds: string[] -} - -export interface AgentTaskRunSummary { - taskId: string - domain?: string - status: AgentTaskStatus - reason: string - readinessStatus: KnowledgeReadinessDecision['status'] - readinessScore: number - recommendedAction: KnowledgeReadinessReport['recommendedAction'] - blockingGapIds: string[] - nonBlockingGapIds: string[] - questionCount: number - acquisitionPlanCount: number - acquiredEvidenceCount: number - controlStepCount: number - pass: boolean - failureClass?: string - wallMs: number - costUsd: number -} - -export interface KnowledgeReadinessDecision { - passed: boolean - status: 'ready' | 'blocked' | 'caveat' - reason: string - readinessScore: number - recommendedAction: KnowledgeReadinessReport['recommendedAction'] - severity: KnowledgeReadinessReport['severity'] - blockingGapIds: string[] - nonBlockingGapIds: string[] -} - -export interface RuntimeEventCollector { - onEvent: AgentRuntimeEventSink - events: Array> -} - -export type RuntimeStreamEventSink = (event: 
RuntimeStreamEvent) => void - -export interface RuntimeStreamEventSummary { - /** Total count of sanitized events collected. */ - eventCount: number - /** Count of events per `type`. Useful for log-line summaries. */ - eventCountsByType: Record - /** First session id observed in a `session_created` / `session_resumed` event, if any. */ - firstSessionId?: string - /** Last `final` event's status, if a final event was observed. */ - finalStatus?: AgentTaskStatus - /** Last `final` event's reason, if a final event was observed. */ - finalReason?: string - /** Concatenated `text_delta.text` across the stream, even when payloads are redacted. */ - finalText: string -} - -export interface RuntimeStreamEventCollector { - onEvent: RuntimeStreamEventSink - events: Array> - /** Snapshot of a small streaming-flavored summary derived from collected events. */ - summary(): RuntimeStreamEventSummary -} - -export interface ServerSentEventOptions { - event?: string - id?: string - retry?: number -} - -export class InMemoryRuntimeSessionStore implements RuntimeSessionStore { - private readonly sessions = new Map() - private readonly events = new Map() - - get(sessionId: string): RuntimeSession | undefined { - return this.sessions.get(sessionId) - } - - put(session: RuntimeSession): void { - this.sessions.set(session.id, session) - } - - appendEvent(sessionId: string, event: RuntimeStreamEvent): void { - const existing = this.events.get(sessionId) ?? [] - existing.push(event) - this.events.set(sessionId, existing) - } - - listEvents(sessionId: string): RuntimeStreamEvent[] { - return [...(this.events.get(sessionId) ?? 
[])] - } -} - -export async function runAgentTask( - options: RunAgentTaskOptions, -): Promise> { - const task = options.task - await emit(options.onEvent, { type: 'task_start', task }) - await emit(options.onEvent, { type: 'readiness_start', task }) - let knowledge = await buildReadiness(task, options.knowledge) - await emit(options.onEvent, { type: 'readiness_end', task, knowledge }) - const questions = userQuestionsForKnowledgeGaps(knowledge.blockingMissingRequirements) - const acquisitionPlans = acquisitionPlansForKnowledgeGaps([ - ...knowledge.blockingMissingRequirements, - ...knowledge.nonBlockingGaps, - ]) - const preflight = await runKnowledgePreflight(task, questions, acquisitionPlans, options.knowledge, options.onEvent) - if (options.knowledge?.refreshReadiness && (Object.keys(preflight.userAnswers).length > 0 || preflight.acquiredEvidenceIds.length > 0)) { - await emit(options.onEvent, { type: 'readiness_start', task }) - knowledge = await options.knowledge.refreshReadiness({ - task, - previous: knowledge, - userAnswers: preflight.userAnswers, - acquiredEvidenceIds: preflight.acquiredEvidenceIds, - }) - await emit(options.onEvent, { type: 'readiness_end', task, knowledge }) - } - - await emit(options.onEvent, { type: 'control_start', task, knowledge }) - const scenarioId = options.scenarioId ?? 
task.id - const control = await runAgentControlLoop({ - intent: task.intent, - budget: task.budget, - signal: options.signal, - store: options.store, - scenarioId, - projectId: options.projectId, - variantId: options.variantId, - observe: ({ history, abortSignal }) => options.adapter.observe({ task, knowledge, history, abortSignal }), - validate: async ({ state, history, abortSignal }) => { - const readinessEval = blockingKnowledgeEval(knowledge, { minimumScore: options.minimumReadinessScore }) - const evals = await options.adapter.validate({ task, knowledge, state, history, abortSignal }) - return [readinessEval as TEval, ...evals] - }, - decide: (ctx) => { - if (isKnowledgeBlocked(ctx.evals)) { - return options.adapter.onKnowledgeBlocked?.({ task, knowledge, questions, acquisitionPlans }) ?? { - type: 'stop', - pass: false, - score: knowledge.readinessScore, - reason: `knowledge readiness blocked: ${knowledge.reason}`, - } - } - return options.adapter.decide(toAgentContext(task, knowledge, ctx)) - }, - act: (action, ctx) => options.adapter.act(action, toAgentContext(task, knowledge, ctx)), - shouldStop: options.adapter.shouldStop - ? (ctx) => options.adapter.shouldStop!(toAgentContext(task, knowledge, ctx)) - : undefined, - getActionCostUsd: options.adapter.getActionCostUsd - ? ({ action, result, state, evals, history }) => options.adapter.getActionCostUsd!({ action, result, task, state, evals, history }) - : undefined, - onStep: (step) => emit(options.onEvent, { type: 'control_step', task, step }), - }) - await emit(options.onEvent, { type: 'control_end', task, control }) - const status = statusFromControl(control) - await emit(options.onEvent, { type: 'task_end', task, status, reason: control.reason }) - - return { - task, - status, - knowledge, - questions, - acquisitionPlans, - userAnswers: preflight.userAnswers, - acquiredEvidenceIds: preflight.acquiredEvidenceIds, - control, - runRecords: (options.adapter.projectRunRecords?.(control, task) ?? 
[]).map((record) => ( - record.scenarioId === undefined ? { ...record, scenarioId } : record - )), - } -} - -export function summarizeAgentTaskRun( - result: AgentTaskRunResult, -): AgentTaskRunSummary { - return { - taskId: result.task.id, - domain: result.task.domain, - status: result.status, - reason: result.control.reason, - readinessStatus: decideKnowledgeReadiness(result.knowledge).status, - readinessScore: result.knowledge.readinessScore, - recommendedAction: result.knowledge.recommendedAction, - blockingGapIds: result.knowledge.blockingMissingRequirements.map((requirement) => requirement.id), - nonBlockingGapIds: result.knowledge.nonBlockingGaps.map((requirement) => requirement.id), - questionCount: result.questions.length, - acquisitionPlanCount: result.acquisitionPlans.length, - acquiredEvidenceCount: result.acquiredEvidenceIds.length, - controlStepCount: result.control.steps.length, - pass: result.control.pass, - failureClass: result.control.failureClass, - wallMs: result.control.wallMs, - costUsd: result.control.spentCostUsd, - } -} - -export async function* runAgentTaskStream( - options: RunAgentTaskStreamOptions, -): AsyncIterable { - const task = options.task - const input = { task, ...(options.input ?? 
{}) } as TInput - const started = streamEvent({ type: 'task_start', task }) - yield started - - const readinessStart = streamEvent({ type: 'readiness_start', task }) - yield readinessStart - let knowledge = await buildReadiness(task, options.knowledge) - const questions = userQuestionsForKnowledgeGaps(knowledge.blockingMissingRequirements) - const acquisitionPlans = acquisitionPlansForKnowledgeGaps([ - ...knowledge.blockingMissingRequirements, - ...knowledge.nonBlockingGaps, - ]) - const preflight = await runKnowledgePreflightStream(task, questions, acquisitionPlans, options.knowledge) - for (const event of preflight.events) yield event - if (options.knowledge?.refreshReadiness && (Object.keys(preflight.userAnswers).length > 0 || preflight.acquiredEvidenceIds.length > 0)) { - yield streamEvent({ type: 'readiness_start', task }) - knowledge = await options.knowledge.refreshReadiness({ - task, - previous: knowledge, - userAnswers: preflight.userAnswers, - acquiredEvidenceIds: preflight.acquiredEvidenceIds, - }) - } - const decision = decideKnowledgeReadiness(knowledge, { minimumScore: options.minimumReadinessScore }) - yield streamEvent({ type: 'readiness_end', task, knowledge, decision }) - if (!decision.passed && decision.status === 'blocked') { - const reason = `knowledge readiness blocked: ${decision.reason}` - yield streamEvent({ type: 'task_end', task, status: 'blocked', reason }) - yield streamEvent({ type: 'final', task, status: 'blocked', reason }) - return - } - - const store = options.sessionStore - const existing = options.sessionId ? await store?.get(options.sessionId) : undefined - const shouldResume = Boolean(options.resume && existing) - let session = shouldResume && existing - ? 
await resumeBackendSession(options.backend, existing, input, { task, knowledge, signal: options.signal }) - : await startBackendSession(options.backend, input, { task, knowledge, signal: options.signal }, options.sessionId) - await store?.put(session) - const sessionEvent = streamEvent({ - type: shouldResume ? 'session_resumed' : 'session_created', - task, - session, - }) - await store?.appendEvent?.(session.id, sessionEvent) - yield sessionEvent - - const backendStart = streamEvent({ type: 'backend_start', task, session, backend: options.backend.kind }) - await store?.appendEvent?.(session.id, backendStart) - yield backendStart - - let finalText = '' - try { - for await (const rawEvent of options.backend.stream(input, { task, knowledge, session, signal: options.signal })) { - const event = normalizeBackendStreamEvent(rawEvent, task, session) - if (event.type === 'text_delta') finalText += event.text - await store?.appendEvent?.(session.id, event) - yield event - } - const completedStatus: AgentTaskStatus = 'completed' - session = touchSession({ ...session, status: completedStatus }) - await store?.put(session) - const backendEnd = streamEvent({ type: 'backend_end', task, session, backend: options.backend.kind }) - await store?.appendEvent?.(session.id, backendEnd) - yield backendEnd - const reason = 'backend completed' - const taskEnd = streamEvent({ type: 'task_end', task, status: completedStatus, reason }) - await store?.appendEvent?.(session.id, taskEnd) - yield taskEnd - const final = streamEvent({ type: 'final', task, session, status: completedStatus, reason, text: finalText || undefined }) - await store?.appendEvent?.(session.id, final) - yield final - } catch (err) { - const message = err instanceof Error ? err.message : String(err) - session = touchSession({ ...session, status: options.signal?.aborted ? 
'aborted' : 'failed' }) - await store?.put(session) - let stopErrorMessage: string | undefined - try { - await options.backend.stop?.(session, message) - } catch (stopErr) { - stopErrorMessage = stopErr instanceof Error ? stopErr.message : String(stopErr) - } - const backendError = streamEvent({ - type: 'backend_error', - task, - session, - backend: options.backend.kind, - message: stopErrorMessage ? `${message}; backend stop failed: ${stopErrorMessage}` : message, - recoverable: !options.signal?.aborted, - }) - await store?.appendEvent?.(session.id, backendError) - yield backendError - const status: AgentTaskStatus = options.signal?.aborted ? 'aborted' : 'failed' - const taskEnd = streamEvent({ type: 'task_end', task, status, reason: message }) - await store?.appendEvent?.(session.id, taskEnd) - yield taskEnd - const final = streamEvent({ type: 'final', task, session, status, reason: message, text: finalText || undefined }) - await store?.appendEvent?.(session.id, final) - yield final - } -} - -export function decideKnowledgeReadiness( - report: KnowledgeReadinessReport, - options: { minimumScore?: number } = {}, -): KnowledgeReadinessDecision { - const minimumScore = options.minimumScore ?? 
0.7 - const blockingGapIds = report.blockingMissingRequirements.map((requirement) => requirement.id) - const nonBlockingGapIds = report.nonBlockingGaps.map((requirement) => requirement.id) - if (blockingGapIds.length > 0) { - return { - passed: false, - status: 'blocked', - reason: report.reason, - readinessScore: report.readinessScore, - recommendedAction: report.recommendedAction, - severity: report.severity, - blockingGapIds, - nonBlockingGapIds, - } - } - if (report.readinessScore < minimumScore) { - return { - passed: false, - status: 'caveat', - reason: `Knowledge readiness score ${report.readinessScore.toFixed(3)} is below minimum ${minimumScore.toFixed(3)}.`, - readinessScore: report.readinessScore, - recommendedAction: report.recommendedAction, - severity: report.severity, - blockingGapIds, - nonBlockingGapIds, - } - } - return { - passed: true, - status: 'ready', - reason: report.reason, - readinessScore: report.readinessScore, - recommendedAction: report.recommendedAction, - severity: report.severity, - blockingGapIds, - nonBlockingGapIds, - } -} - -export function sanitizeKnowledgeReadinessReport( - report: KnowledgeReadinessReport, - options: RuntimeTelemetryOptions = {}, -): SanitizedKnowledgeReadinessReport { - return { - taskId: report.taskId, - readinessScore: report.readinessScore, - recommendedAction: report.recommendedAction, - severity: report.severity, - reason: report.reason, - blockingMissingRequirements: report.blockingMissingRequirements.map((requirement) => - sanitizeKnowledgeRequirement(requirement, options), - ), - nonBlockingGaps: report.nonBlockingGaps.map((requirement) => - sanitizeKnowledgeRequirement(requirement, options), - ), - evidenceCount: report.bundle.evidenceIds.length, - evidenceIds: options.includeEvidenceIds ? 
report.bundle.evidenceIds : undefined, - missingRequirementIds: report.bundle.missing.map((requirement) => requirement.id), - } -} - -export function sanitizeAgentRuntimeEvent( - event: AgentRuntimeEvent, - options: RuntimeTelemetryOptions = {}, -): Record { - const base = { type: event.type, task: sanitizeTask(event.task, options) } - if (event.type === 'readiness_start' || event.type === 'task_start' || event.type === 'control_start') { - return event.type === 'control_start' - ? { ...base, knowledge: sanitizeKnowledgeReadinessReport(event.knowledge, options) } - : base - } - if (event.type === 'readiness_end') { - return { ...base, knowledge: sanitizeKnowledgeReadinessReport(event.knowledge, options) } - } - if (event.type === 'questions_start') { - return { ...base, questions: event.questions.map((question) => sanitizeQuestion(question, options)) } - } - if (event.type === 'questions_end') { - return { - ...base, - questions: event.questions.map((question) => sanitizeQuestion(question, options)), - userAnswers: options.includeUserAnswers ? event.userAnswers : redactRecord(event.userAnswers), - } - } - if (event.type === 'acquisition_start') { - return { ...base, acquisitionPlans: event.acquisitionPlans.map(sanitizeAcquisitionPlan) } - } - if (event.type === 'acquisition_end') { - return { - ...base, - acquisitionPlans: event.acquisitionPlans.map(sanitizeAcquisitionPlan), - acquiredEvidenceCount: event.acquiredEvidenceIds.length, - acquiredEvidenceIds: options.includeEvidenceIds ? 
event.acquiredEvidenceIds : undefined, - } - } - if (event.type === 'control_step') { - return { ...base, step: sanitizeControlStep(event.step, options) } - } - if (event.type === 'control_end') { - return { ...base, control: sanitizeControlRun(event.control, options) } - } - return { ...base, status: event.status, reason: event.reason } -} - -export function sanitizeRuntimeStreamEvent( - event: RuntimeStreamEvent, - options: RuntimeTelemetryOptions = {}, -): Record { - const withTask = 'task' in event && event.task - ? { task: sanitizeTask(event.task, options) } - : {} - const withSession = 'session' in event && event.session - ? { session: sanitizeRuntimeSession(event.session, options) } - : {} - - if (event.type === 'readiness_end') { - return { - type: event.type, - ...withTask, - timestamp: event.timestamp, - decision: event.decision, - knowledge: sanitizeKnowledgeReadinessReport(event.knowledge, options), - } - } - if (event.type === 'questions_start') { - return { type: event.type, ...withTask, timestamp: event.timestamp, questions: event.questions.map((question) => sanitizeQuestion(question, options)) } - } - if (event.type === 'questions_end') { - return { - type: event.type, - ...withTask, - timestamp: event.timestamp, - questions: event.questions.map((question) => sanitizeQuestion(question, options)), - userAnswers: options.includeUserAnswers ? event.userAnswers : redactRecord(event.userAnswers), - } - } - if (event.type === 'acquisition_start') { - return { type: event.type, ...withTask, timestamp: event.timestamp, acquisitionPlans: event.acquisitionPlans.map(sanitizeAcquisitionPlan) } - } - if (event.type === 'acquisition_end') { - return { - type: event.type, - ...withTask, - timestamp: event.timestamp, - acquisitionPlans: event.acquisitionPlans.map(sanitizeAcquisitionPlan), - acquiredEvidenceCount: event.acquiredEvidenceIds.length, - acquiredEvidenceIds: options.includeEvidenceIds ? 
event.acquiredEvidenceIds : undefined, - } - } - if (event.type === 'tool_call') { - return { - type: event.type, - ...withTask, - ...withSession, - timestamp: event.timestamp, - toolName: event.toolName, - toolCallId: event.toolCallId, - args: options.includeControlPayloads ? event.args : undefined, - } - } - if (event.type === 'tool_result') { - return { - type: event.type, - ...withTask, - ...withSession, - timestamp: event.timestamp, - toolName: event.toolName, - toolCallId: event.toolCallId, - result: options.includeControlPayloads ? event.result : undefined, - } - } - if (event.type === 'artifact') { - return { - type: event.type, - ...withTask, - ...withSession, - timestamp: event.timestamp, - artifactId: event.artifactId, - name: event.name, - mimeType: event.mimeType, - uri: options.includeEvidenceIds ? event.uri : undefined, - metadata: options.includeMetadata ? event.metadata : undefined, - } - } - if (event.type === 'final') { - return { - type: event.type, - ...withTask, - ...withSession, - timestamp: event.timestamp, - status: event.status, - reason: event.reason, - text: options.includeControlPayloads ? event.text : undefined, - metadata: options.includeMetadata ? event.metadata : undefined, - } - } - return { - type: event.type, - ...withTask, - ...withSession, - timestamp: 'timestamp' in event ? event.timestamp : undefined, - ...pickPublicStreamFields(event), - } -} - -export function createRuntimeEventCollector( - options: RuntimeTelemetryOptions = {}, -): RuntimeEventCollector { - const events: Array> = [] - return { - events, - onEvent: (event) => { - events.push(sanitizeAgentRuntimeEvent(event, options)) - }, - } -} - /** - * Streaming-event counterpart of `createRuntimeEventCollector`. Use this with - * `runAgentTaskStream` — pass each yielded event through `onEvent` and read - * the sanitized copies off `events`. The same `RuntimeTelemetryOptions` - * redaction flags apply. 
+ * @tangle-network/agent-runtime + * + * Reusable runtime lifecycle for domain-specific agents. Standardizes the + * task lifecycle (knowledge readiness → questions / acquisition → control + * loop → eval) and delegates domain behavior to an adapter. Owns no domain + * policy, models, tools, connectors, or UI. * - * Stream and non-stream events have different field shapes (timestamps, - * sessions, text/tool deltas) so this is a sibling factory rather than an - * overload of `createRuntimeEventCollector`; the unified-union alternative - * was rejected because dispatching on `type` alone would silently misroute - * events whose `type` literals overlap (`task_start`, `readiness_end`, etc.). + * See `docs/concepts.md` (mental model) and `README.md` (quickstart). Every + * public export below carries a `@stable` or `@experimental` tag; treat + * `@experimental` exports as subject to change inside this minor. */ -export function createRuntimeStreamEventCollector( - options: RuntimeTelemetryOptions = {}, -): RuntimeStreamEventCollector { - const events: Array> = [] - const eventCountsByType: Record = {} - let firstSessionId: string | undefined - let finalStatus: AgentTaskStatus | undefined - let finalReason: string | undefined - let finalText = '' - return { - events, - onEvent: (event) => { - events.push(sanitizeRuntimeStreamEvent(event, options)) - eventCountsByType[event.type] = (eventCountsByType[event.type] ?? 
0) + 1 - if (event.type === 'text_delta') finalText += event.text - if (!firstSessionId && (event.type === 'session_created' || event.type === 'session_resumed')) { - firstSessionId = event.session.id - } - if (event.type === 'final') { - finalStatus = event.status - finalReason = event.reason - } - }, - summary() { - return { - eventCount: events.length, - eventCountsByType: { ...eventCountsByType }, - firstSessionId, - finalStatus, - finalReason, - finalText, - } - }, - } -} - -export function encodeServerSentEvent( - data: unknown, - options: ServerSentEventOptions = {}, -): string { - const lines: string[] = [] - if (options.id) lines.push(`id: ${stripNewlines(options.id)}`) - if (options.event) lines.push(`event: ${stripNewlines(options.event)}`) - if (typeof options.retry === 'number' && Number.isFinite(options.retry) && options.retry >= 0) { - lines.push(`retry: ${Math.floor(options.retry)}`) - } - - const payload = typeof data === 'string' ? data : JSON.stringify(data) - for (const line of payload.split(/\r?\n/)) { - lines.push(`data: ${line}`) - } - return `${lines.join('\n')}\n\n` -} - -export function readinessServerSentEvent( - report: KnowledgeReadinessReport, - options: RuntimeTelemetryOptions & ServerSentEventOptions = {}, -): string { - const { event, id, retry, ...telemetryOptions } = options - return encodeServerSentEvent({ - type: 'readiness', - readiness: sanitizeKnowledgeReadinessReport(report, telemetryOptions), - }, { event, id, retry }) -} - -export function runtimeStreamServerSentEvent( - event: RuntimeStreamEvent, - options: RuntimeTelemetryOptions & ServerSentEventOptions = {}, -): string { - const { event: sseEvent, id, retry, ...telemetryOptions } = options - return encodeServerSentEvent(sanitizeRuntimeStreamEvent(event, telemetryOptions), { event: sseEvent, id, retry }) -} - -export function createIterableBackend( - options: { - kind: string - start?: AgentExecutionBackend['start'] - resume?: AgentExecutionBackend['resume'] - stream: 
AgentExecutionBackend['stream'] - stop?: AgentExecutionBackend['stop'] - }, -): AgentExecutionBackend { - return options -} - -export function createSandboxPromptBackend( - options: { - kind?: string - getBox(input: TInput, context: Omit): Promise | TBox - streamPrompt(box: TBox, message: string, context: AgentBackendContext): AsyncIterable - mapEvent?: (event: unknown, context: AgentBackendContext) => RuntimeStreamEvent | undefined - getSessionId?: (box: TBox, input: TInput) => string | undefined - }, -): AgentExecutionBackend { - return { - kind: options.kind ?? 'sandbox', - async start(input, context) { - const box = await options.getBox(input, context) - return newRuntimeSession(options.kind ?? 'sandbox', options.getSessionId?.(box, input) ?? context.requestedSessionId, { - resumable: true, - }) - }, - resume(session) { - return touchSession({ ...session, status: 'active' }) - }, - async *stream(input, context) { - const box = await options.getBox(input, context) - const message = input.message ?? input.messages?.at(-1)?.content ?? context.task.intent - for await (const event of options.streamPrompt(box, message, context)) { - const mapped = options.mapEvent?.(event, context) ?? mapCommonBackendEvent(event, context) - if (mapped) yield mapped - } - }, - } -} - -export function createOpenAICompatibleBackend( - options: { - apiKey: string - baseUrl: string - model: string - kind?: string - fetchImpl?: typeof fetch - }, -): AgentExecutionBackend { - const fetcher = options.fetchImpl ?? fetch - return { - kind: options.kind ?? 'tcloud', - start(_input, context) { - return newRuntimeSession(options.kind ?? 
'tcloud', context.requestedSessionId) - }, - async *stream(input, context) { - const response = await fetcher(`${options.baseUrl.replace(/\/$/, '')}/chat/completions`, { - method: 'POST', - headers: { - Authorization: `Bearer ${options.apiKey}`, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - model: options.model, - stream: true, - messages: input.messages ?? [{ role: 'user', content: input.message ?? context.task.intent }], - }), - signal: context.signal, - }) - if (!response.ok) throw new Error(`chat backend returned ${response.status}`) - yield* streamResponseEvents(response, context) - }, - } -} - -async function runKnowledgePreflight( - task: AgentTaskSpec, - questions: UserQuestion[], - acquisitionPlans: DataAcquisitionPlan[], - provider: AgentKnowledgeProvider | undefined, - onEvent: AgentRuntimeEventSink | undefined, -): Promise<{ userAnswers: Record; acquiredEvidenceIds: string[] }> { - let userAnswers: Record = {} - let acquiredEvidenceIds: string[] = [] - if (questions.length > 0 && provider?.answerQuestions) { - await emit(onEvent, { type: 'questions_start', task, questions }) - userAnswers = await provider.answerQuestions(questions, task) - await emit(onEvent, { type: 'questions_end', task, questions, userAnswers }) - } - if (acquisitionPlans.length > 0 && provider?.executeAcquisitionPlans) { - await emit(onEvent, { type: 'acquisition_start', task, acquisitionPlans }) - acquiredEvidenceIds = await provider.executeAcquisitionPlans(acquisitionPlans, task) - await emit(onEvent, { type: 'acquisition_end', task, acquisitionPlans, acquiredEvidenceIds }) - } - return { userAnswers, acquiredEvidenceIds } -} - -async function runKnowledgePreflightStream( - task: AgentTaskSpec, - questions: UserQuestion[], - acquisitionPlans: DataAcquisitionPlan[], - provider: AgentKnowledgeProvider | undefined, -): Promise<{ - userAnswers: Record - acquiredEvidenceIds: string[] - events: RuntimeStreamEvent[] -}> { - const events: RuntimeStreamEvent[] = [] 
- let userAnswers: Record = {} - let acquiredEvidenceIds: string[] = [] - if (questions.length > 0 && provider?.answerQuestions) { - events.push(streamEvent({ type: 'questions_start', task, questions })) - userAnswers = await provider.answerQuestions(questions, task) - events.push(streamEvent({ type: 'questions_end', task, questions, userAnswers })) - } - if (acquisitionPlans.length > 0 && provider?.executeAcquisitionPlans) { - events.push(streamEvent({ type: 'acquisition_start', task, acquisitionPlans })) - acquiredEvidenceIds = await provider.executeAcquisitionPlans(acquisitionPlans, task) - events.push(streamEvent({ type: 'acquisition_end', task, acquisitionPlans, acquiredEvidenceIds })) - } - return { userAnswers, acquiredEvidenceIds, events } -} - -function sanitizeTask(task: AgentTaskSpec, options: RuntimeTelemetryOptions): Record { - return { - id: task.id, - intent: task.intent, - domain: task.domain, - inputs: options.includeInputs ? task.inputs : task.inputs ? '[redacted]' : undefined, - requiredKnowledge: task.requiredKnowledge?.map((requirement) => - sanitizeKnowledgeRequirement(requirement, options), - ), - metadata: options.includeMetadata ? task.metadata : task.metadata ? '[redacted]' : undefined, - } -} - -function sanitizeRuntimeSession(session: RuntimeSession, options: RuntimeTelemetryOptions): Record { - return { - id: session.id, - backend: session.backend, - status: session.status, - hasResumeToken: Boolean(session.resumeToken), - createdAt: session.createdAt, - updatedAt: session.updatedAt, - metadata: options.includeMetadata ? session.metadata : session.metadata ? '[redacted]' : undefined, - } -} - -function sanitizeKnowledgeRequirement( - requirement: KnowledgeRequirement, - options: RuntimeTelemetryOptions, -): SanitizedKnowledgeRequirement { - const includeDescription = options.includeRequirementDescriptions && requirement.sensitivity !== 'secret' - return { - id: requirement.id, - description: includeDescription ? 
requirement.description : undefined, - requiredFor: requirement.requiredFor, - category: requirement.category, - acquisitionMode: requirement.acquisitionMode, - importance: requirement.importance, - freshness: requirement.freshness, - sensitivity: requirement.sensitivity, - confidenceNeeded: requirement.confidenceNeeded, - currentConfidence: requirement.currentConfidence, - evidenceCount: requirement.evidenceIds.length, - evidenceIds: options.includeEvidenceIds ? requirement.evidenceIds : undefined, - fallbackPolicy: requirement.fallbackPolicy, - } -} - -function sanitizeQuestion(question: UserQuestion, options: RuntimeTelemetryOptions): Record { - return { - id: question.id, - question: options.includeRequirementDescriptions && question.answerType !== 'credential' - ? question.question - : undefined, - reason: options.includeRequirementDescriptions ? question.reason : undefined, - requirementId: question.requirementId, - importance: question.importance, - answerType: question.answerType, - impactIfUnknown: options.includeRequirementDescriptions ? question.impactIfUnknown : undefined, - optionCount: question.options?.length ?? 0, - } -} - -function sanitizeAcquisitionPlan(plan: DataAcquisitionPlan): Record { - return { - id: plan.id, - requirementIds: plan.requirementIds, - mode: plan.mode, - priority: plan.priority, - expectedEvidenceCount: plan.expectedEvidenceIds?.length ?? 0, - questionCount: plan.questions?.length ?? 0, - } -} - -function sanitizeControlStep( - step: ControlStep, - options: RuntimeTelemetryOptions, -): Record { - const actionOutcome = step.actionOutcome - return { - index: step.index, - decisionType: step.decision.type, - reason: step.decision.reason, - action: options.includeControlPayloads && step.decision.type === 'continue' ? step.decision.action : undefined, - result: options.includeControlPayloads && actionOutcome?.ok ? actionOutcome.result : undefined, - actionOk: actionOutcome?.ok, - actionError: actionOutcome?.ok === false ? 
actionOutcome.error : undefined, - durationMs: actionOutcome?.durationMs, - evalsBefore: summarizeEvals(step.evalsBefore, options), - evalsAfter: summarizeEvals(step.evalsAfter, options), - startedAt: step.startedAt, - endedAt: step.endedAt, - } -} - -function sanitizeControlRun( - control: ControlRunResult, - options: RuntimeTelemetryOptions, -): Record { - return { - pass: control.pass, - completed: control.completed, - reason: control.reason, - score: control.score, - stepCount: control.steps.length, - wallMs: control.wallMs, - spentCostUsd: control.spentCostUsd, - failureClass: control.failureClass, - stoppedBy: control.stoppedBy, - runId: control.runId, - runtimeErrorCount: control.runtimeErrors.length, - finalEvals: summarizeEvals(control.finalEvals, options), - } -} - -function summarizeEvals(evals: ControlEvalResult[], options: RuntimeTelemetryOptions): Array> { - return evals.map((evalResult) => ({ - id: evalResult.id, - passed: evalResult.passed, - score: evalResult.score, - severity: evalResult.severity, - objective: evalResult.objective, - detail: options.includeEvalDetails ? evalResult.detail : undefined, - evidence: options.includeEvalDetails ? 
evalResult.evidence : undefined, - })) -} - -function redactRecord(record: Record): Record { - return Object.fromEntries(Object.keys(record).map((key) => [key, '[redacted]'])) -} - -function stripNewlines(value: string): string { - return value.replace(/[\r\n]/g, ' ') -} - -function timestamp(): string { - return new Date().toISOString() -} - -function streamEvent>(event: T): T & { timestamp: string } { - return { ...event, timestamp: timestamp() } -} - -function newRuntimeSession(backend: string, requestedId?: string, metadata?: Record): RuntimeSession { - const now = timestamp() - return { - id: requestedId || crypto.randomUUID(), - backend, - status: 'active', - createdAt: now, - updatedAt: now, - metadata, - } -} - -function touchSession(session: RuntimeSession): RuntimeSession { - return { ...session, updatedAt: timestamp() } -} - -async function startBackendSession( - backend: AgentExecutionBackend, - input: TInput, - context: Omit, - requestedSessionId?: string, -): Promise { - if (backend.start) return backend.start(input, { ...context, requestedSessionId }) - return newRuntimeSession(backend.kind, requestedSessionId) -} - -async function resumeBackendSession( - backend: AgentExecutionBackend, - session: RuntimeSession, - input: TInput, - context: Omit, -): Promise { - if (session.backend !== backend.kind) { - throw new Error(`Cannot resume ${session.backend} session with ${backend.kind} backend`) - } - if (backend.resume) return backend.resume(session, input, context) - return touchSession({ ...session, status: 'active' }) -} - -function normalizeBackendStreamEvent(event: RuntimeStreamEvent, task: AgentTaskSpec, session: RuntimeSession): RuntimeStreamEvent { - if ('task' in event && event.task && 'session' in event && event.session && 'timestamp' in event && event.timestamp) return event - return { - ...event, - task: 'task' in event && event.task ? event.task : task, - session: 'session' in event && event.session ? 
event.session : session, - timestamp: 'timestamp' in event && event.timestamp ? event.timestamp : timestamp(), - } as RuntimeStreamEvent -} - -function pickPublicStreamFields(event: RuntimeStreamEvent): Record { - if (event.type === 'session_created' || event.type === 'session_resumed') return {} - if (event.type === 'backend_start' || event.type === 'backend_end') return { backend: event.backend } - if (event.type === 'backend_error') return { backend: event.backend, message: event.message, recoverable: event.recoverable } - if (event.type === 'task_end') return { status: event.status, reason: event.reason } - if (event.type === 'text_delta' || event.type === 'reasoning_delta') return { text: event.text } - return {} -} - -function mapCommonBackendEvent(event: unknown, context: AgentBackendContext): RuntimeStreamEvent | undefined { - if (!event || typeof event !== 'object') return undefined - const record = event as Record - const type = String(record.type ?? '') - const data = record.data && typeof record.data === 'object' ? record.data as Record : record - if (type === 'message.part.updated' || type === 'text_delta' || type === 'delta') { - const text = stringValue(data.text) ?? stringValue(data.delta) ?? stringValue(record.text) - return text ? { type: 'text_delta', task: context.task, session: context.session, text, timestamp: timestamp() } : undefined - } - if (type === 'reasoning_delta') { - const text = stringValue(data.text) ?? stringValue(record.text) - return text ? { type: 'reasoning_delta', task: context.task, session: context.session, text, timestamp: timestamp() } : undefined - } - if (type === 'tool_call') { - return { - type: 'tool_call', - task: context.task, - session: context.session, - toolName: stringValue(data.name) ?? stringValue(record.toolName) ?? 'tool', - toolCallId: stringValue(data.id) ?? stringValue(record.toolCallId), - args: data.args ?? data.input ?? 
record.args, - timestamp: timestamp(), - } - } - if (type === 'tool_result') { - return { - type: 'tool_result', - task: context.task, - session: context.session, - toolName: stringValue(data.name) ?? stringValue(record.toolName) ?? 'tool', - toolCallId: stringValue(data.id) ?? stringValue(record.toolCallId), - result: data.result ?? data.output ?? record.result, - timestamp: timestamp(), - } - } - if (type === 'result' || type === 'final') { - const text = stringValue(data.finalText) ?? stringValue(data.text) ?? stringValue(record.text) - return text ? { type: 'text_delta', task: context.task, session: context.session, text, timestamp: timestamp() } : undefined - } - return undefined -} - -async function* streamResponseEvents(response: Response, context: AgentBackendContext): AsyncIterable { - const body = response.body - if (!body) return - const reader = body.getReader() - const decoder = new TextDecoder() - let buffer = '' - for (;;) { - const { done, value } = await reader.read() - if (done) break - buffer += decoder.decode(value, { stream: true }).replace(/\r\n/g, '\n') - for (const event of drainStreamBuffer(false)) yield event - } - buffer += decoder.decode().replace(/\r\n/g, '\n') - for (const event of drainStreamBuffer(true)) yield event - if (buffer.trim()) { - const event = parseStreamChunk(buffer, context) - if (event) yield event - } - - function* drainStreamBuffer(flush: boolean): Iterable { - for (;;) { - const sseBoundary = buffer.indexOf('\n\n') - if (sseBoundary >= 0) { - const chunk = buffer.slice(0, sseBoundary) - buffer = buffer.slice(sseBoundary + 2) - const event = parseStreamChunk(chunk, context) - if (event) yield event - continue - } - - const newline = buffer.indexOf('\n') - if (newline >= 0 && !buffer.slice(0, newline).startsWith('data:')) { - const line = buffer.slice(0, newline) - buffer = buffer.slice(newline + 1) - const event = parseStreamChunk(line, context) - if (event) yield event - continue - } - - if (flush && buffer.trim() && 
!buffer.trimStart().startsWith('data:')) { - const line = buffer - buffer = '' - const event = parseStreamChunk(line, context) - if (event) yield event - continue - } - - break - } - } -} - -function parseStreamChunk(chunk: string, context: AgentBackendContext): RuntimeStreamEvent | undefined { - const lines = chunk.split(/\r?\n/) - const dataLines = lines.filter((line) => line.startsWith('data:')) - const data = dataLines.length > 0 - ? dataLines.map((line) => line.slice(5).trimStart()).join('\n') - : chunk.trim() - if (!data || data === '[DONE]') return undefined - try { - const parsed = JSON.parse(data) as Record - const choice = Array.isArray(parsed.choices) ? parsed.choices[0] as Record | undefined : undefined - const delta = choice?.delta as Record | undefined - const message = choice?.message as Record | undefined - const text = stringValue(delta?.content) ?? stringValue(message?.content) ?? stringValue(parsed.text) - if (text) return { type: 'text_delta', task: context.task, session: context.session, text, timestamp: timestamp() } - return mapCommonBackendEvent(parsed, context) - } catch { - return { type: 'text_delta', task: context.task, session: context.session, text: data, timestamp: timestamp() } - } -} - -function stringValue(value: unknown): string | undefined { - return typeof value === 'string' && value.length > 0 ? value : undefined -} - -function buildReadiness( - task: AgentTaskSpec, - provider: AgentKnowledgeProvider | undefined, -): Promise | KnowledgeReadinessReport { - if (provider?.buildReadiness) return provider.buildReadiness(task) - return scoreKnowledgeReadiness({ - taskId: task.id, - requirements: task.requiredKnowledge ?? 
[], - metadata: { domain: task.domain, ...task.metadata }, - }) -} - -function isKnowledgeBlocked(evals: ControlEvalResult[]): boolean { - return evals.some((evalResult) => evalResult.id === 'knowledge-ready' && !evalResult.passed) -} - -function statusFromControl(control: ControlRunResult): AgentTaskStatus { - if (control.stoppedBy === 'abort') return 'aborted' - if (control.reason.includes('knowledge readiness blocked')) return 'blocked' - if (control.pass) return 'completed' - return 'failed' -} - -async function emit( - sink: AgentRuntimeEventSink | undefined, - event: AgentRuntimeEvent, -): Promise { - await sink?.(event) -} - -function toAgentContext( - task: AgentTaskSpec, - knowledge: KnowledgeReadinessReport, - ctx: ControlContext, -): AgentTaskContext { - return { - task, - knowledge, - state: ctx.state, - evals: ctx.evals, - history: ctx.history, - budget: ctx.budget, - stepIndex: ctx.stepIndex, - wallMs: ctx.wallMs, - spentCostUsd: ctx.spentCostUsd, - remainingCostUsd: ctx.remainingCostUsd, - abortSignal: ctx.abortSignal, - } -} +// ── Re-exports from @tangle-network/agent-eval (compat surface) ────── export type { ControlBudget, ControlDecision, @@ -1386,3 +24,89 @@ export type { RunRecord, UserQuestion, } from '@tangle-network/agent-eval' +// ── Backends ────────────────────────────────────────────────────────── +export { + createIterableBackend, + createOpenAICompatibleBackend, + createSandboxPromptBackend, +} from './backends' +// ── Errors ─────────────────────────────────────────────────────────── +export { + AgentEvalError, + type AgentEvalErrorCode, + BackendTransportError, + CaptureIntegrityError, + ConfigError, + JudgeError, + NotFoundError, + ReplayError, + RuntimeRunStateError, + SessionMismatchError, + ValidationError, + VerificationError, +} from './errors' +// ── Readiness ───────────────────────────────────────────────────────── +export { decideKnowledgeReadiness } from './readiness' +// ── Run loop 
───────────────────────────────────────────────────────── +export { runAgentTask, runAgentTaskStream, summarizeAgentTaskRun } from './run' +export type { + RuntimeRunCompleteInput, + RuntimeRunCost, + RuntimeRunHandle, + RuntimeRunOptions, + RuntimeRunPersistenceAdapter, + RuntimeRunRow, + RuntimeRunStatus, +} from './runtime-run' +// ── Production run lifecycle (new in 0.7.0) ────────────────────────── +export { startRuntimeRun } from './runtime-run' +export type { + RuntimeEventCollector, + RuntimeStreamEventCollector, + RuntimeStreamEventSink, + RuntimeStreamEventSummary, + RuntimeTelemetryOptions, + SanitizedKnowledgeReadinessReport, + SanitizedKnowledgeRequirement, +} from './sanitize' +// ── Sanitization / telemetry ───────────────────────────────────────── +export { + createRuntimeEventCollector, + createRuntimeStreamEventCollector, + sanitizeAgentRuntimeEvent, + sanitizeKnowledgeReadinessReport, + sanitizeRuntimeStreamEvent, +} from './sanitize' +// ── Sessions ────────────────────────────────────────────────────────── +export { InMemoryRuntimeSessionStore } from './sessions' +export type { ServerSentEventOptions } from './sse' +// ── SSE ─────────────────────────────────────────────────────────────── +export { + encodeServerSentEvent, + readinessServerSentEvent, + runtimeStreamServerSentEvent, +} from './sse' +export type { TraceBridge, TraceBridgeOptions } from './trace-bridge' +// ── agent-eval trace bridge (new in 0.7.0) ─────────────────────────── +export { createTraceBridge, toAgentEvalTrace } from './trace-bridge' +// ── Core types ─────────────────────────────────────────────────────── +export type { + AgentAdapter, + AgentBackendContext, + AgentBackendInput, + AgentExecutionBackend, + AgentKnowledgeProvider, + AgentRuntimeEvent, + AgentRuntimeEventSink, + AgentTaskContext, + AgentTaskRunResult, + AgentTaskRunSummary, + AgentTaskSpec, + AgentTaskStatus, + KnowledgeReadinessDecision, + RunAgentTaskOptions, + RunAgentTaskStreamOptions, + 
RuntimeSession, + RuntimeSessionStore, + RuntimeStreamEvent, +} from './types' diff --git a/src/readiness.ts b/src/readiness.ts new file mode 100644 index 0000000..1e1a4fe --- /dev/null +++ b/src/readiness.ts @@ -0,0 +1,68 @@ +/** + * @stable + * + * Pure readiness-decision helper. Maps a `KnowledgeReadinessReport` from + * `@tangle-network/agent-eval` to a three-state branch (`ready` / `blocked` / + * `caveat`) the runtime, route handlers, and UI shells can all switch on. + * + * Default `minimumScore` of 0.7 mirrors the readiness scoring scale in + * agent-eval; callers tightening or loosening this should keep it consistent + * across all entry points for the same product so the UI / metrics agree on + * what "caveat" means. + */ + +import type { KnowledgeReadinessReport } from '@tangle-network/agent-eval' + +import { ValidationError } from './errors' +import type { KnowledgeReadinessDecision } from './types' + +const DEFAULT_MINIMUM_READINESS_SCORE = 0.7 + +/** @stable */ +export function decideKnowledgeReadiness( + report: KnowledgeReadinessReport, + options: { minimumScore?: number } = {}, +): KnowledgeReadinessDecision { + const minimumScore = options.minimumScore ?? 
DEFAULT_MINIMUM_READINESS_SCORE + if (!Number.isFinite(minimumScore) || minimumScore < 0 || minimumScore > 1) { + throw new ValidationError( + `minimumScore must be a finite number in [0, 1]; received ${String(minimumScore)}`, + ) + } + const blockingGapIds = report.blockingMissingRequirements.map((requirement) => requirement.id) + const nonBlockingGapIds = report.nonBlockingGaps.map((requirement) => requirement.id) + if (blockingGapIds.length > 0) { + return { + passed: false, + status: 'blocked', + reason: report.reason, + readinessScore: report.readinessScore, + recommendedAction: report.recommendedAction, + severity: report.severity, + blockingGapIds, + nonBlockingGapIds, + } + } + if (report.readinessScore < minimumScore) { + return { + passed: false, + status: 'caveat', + reason: `Knowledge readiness score ${report.readinessScore.toFixed(3)} is below minimum ${minimumScore.toFixed(3)}.`, + readinessScore: report.readinessScore, + recommendedAction: report.recommendedAction, + severity: report.severity, + blockingGapIds, + nonBlockingGapIds, + } + } + return { + passed: true, + status: 'ready', + reason: report.reason, + readinessScore: report.readinessScore, + recommendedAction: report.recommendedAction, + severity: report.severity, + blockingGapIds, + nonBlockingGapIds, + } +} diff --git a/src/run.ts b/src/run.ts new file mode 100644 index 0000000..6b1271a --- /dev/null +++ b/src/run.ts @@ -0,0 +1,484 @@ +/** + * @stable + * + * The two top-level entry points: + * + * - `runAgentTask` — single-shot lifecycle for adapter-driven tasks. + * - `runAgentTaskStream` — streaming lifecycle that delegates execution to an + * `AgentExecutionBackend` (model API, sandbox, or custom iterable). + * + * Both gate the run on `KnowledgeReadinessReport` from `agent-eval`, emit the + * same lifecycle event vocabulary (under different shapes — see `types.ts`), + * and route session lifecycle through a pluggable `RuntimeSessionStore`. 
+ */ + +import { + acquisitionPlansForKnowledgeGaps, + blockingKnowledgeEval, + type ControlContext, + type ControlEvalResult, + type ControlRunResult, + type DataAcquisitionPlan, + type KnowledgeReadinessReport, + runAgentControlLoop, + scoreKnowledgeReadiness, + type UserQuestion, + userQuestionsForKnowledgeGaps, +} from '@tangle-network/agent-eval' + +import { normalizeBackendStreamEvent } from './backends' +import { SessionMismatchError } from './errors' +import { decideKnowledgeReadiness } from './readiness' +import { newRuntimeSession, nowIso, touchSession } from './sessions' +import type { + AgentBackendInput, + AgentExecutionBackend, + AgentKnowledgeProvider, + AgentRuntimeEventSink, + AgentTaskContext, + AgentTaskRunResult, + AgentTaskRunSummary, + AgentTaskSpec, + AgentTaskStatus, + RunAgentTaskOptions, + RunAgentTaskStreamOptions, + RuntimeSession, + RuntimeStreamEvent, +} from './types' + +/** @stable */ +export async function runAgentTask< + TState, + TAction, + TActionResult, + TEval extends ControlEvalResult = ControlEvalResult, +>( + options: RunAgentTaskOptions, +): Promise> { + const task = options.task + await emit(options.onEvent, { type: 'task_start', task }) + await emit(options.onEvent, { type: 'readiness_start', task }) + let knowledge = await buildReadiness(task, options.knowledge) + await emit(options.onEvent, { type: 'readiness_end', task, knowledge }) + const questions = userQuestionsForKnowledgeGaps(knowledge.blockingMissingRequirements) + const acquisitionPlans = acquisitionPlansForKnowledgeGaps([ + ...knowledge.blockingMissingRequirements, + ...knowledge.nonBlockingGaps, + ]) + const preflight = await runKnowledgePreflight( + task, + questions, + acquisitionPlans, + options.knowledge, + options.onEvent, + ) + if ( + options.knowledge?.refreshReadiness && + (Object.keys(preflight.userAnswers).length > 0 || preflight.acquiredEvidenceIds.length > 0) + ) { + await emit(options.onEvent, { type: 'readiness_start', task }) + knowledge = 
await options.knowledge.refreshReadiness({ + task, + previous: knowledge, + userAnswers: preflight.userAnswers, + acquiredEvidenceIds: preflight.acquiredEvidenceIds, + }) + await emit(options.onEvent, { type: 'readiness_end', task, knowledge }) + } + + await emit(options.onEvent, { type: 'control_start', task, knowledge }) + const scenarioId = options.scenarioId ?? task.id + const control = await runAgentControlLoop({ + intent: task.intent, + budget: task.budget, + signal: options.signal, + store: options.store, + scenarioId, + projectId: options.projectId, + variantId: options.variantId, + observe: ({ history, abortSignal }) => + options.adapter.observe({ task, knowledge, history, abortSignal }), + validate: async ({ state, history, abortSignal }) => { + const readinessEval = blockingKnowledgeEval(knowledge, { + minimumScore: options.minimumReadinessScore, + }) + const evals = await options.adapter.validate({ + task, + knowledge, + state, + history, + abortSignal, + }) + return [readinessEval as TEval, ...evals] + }, + decide: (ctx) => { + if (isKnowledgeBlocked(ctx.evals)) { + return ( + options.adapter.onKnowledgeBlocked?.({ + task, + knowledge, + questions, + acquisitionPlans, + }) ?? { + type: 'stop', + pass: false, + score: knowledge.readinessScore, + reason: `knowledge readiness blocked: ${knowledge.reason}`, + } + ) + } + return options.adapter.decide(toAgentContext(task, knowledge, ctx)) + }, + act: (action, ctx) => options.adapter.act(action, toAgentContext(task, knowledge, ctx)), + shouldStop: options.adapter.shouldStop + ? (ctx) => options.adapter.shouldStop!(toAgentContext(task, knowledge, ctx)) + : undefined, + getActionCostUsd: options.adapter.getActionCostUsd + ? 
({ action, result, state, evals, history }) => + options.adapter.getActionCostUsd!({ action, result, task, state, evals, history }) + : undefined, + onStep: (step) => emit(options.onEvent, { type: 'control_step', task, step }), + }) + await emit(options.onEvent, { type: 'control_end', task, control }) + const status = statusFromControl(control) + await emit(options.onEvent, { type: 'task_end', task, status, reason: control.reason }) + + return { + task, + status, + knowledge, + questions, + acquisitionPlans, + userAnswers: preflight.userAnswers, + acquiredEvidenceIds: preflight.acquiredEvidenceIds, + control, + runRecords: (options.adapter.projectRunRecords?.(control, task) ?? []).map((record) => + record.scenarioId === undefined ? { ...record, scenarioId } : record, + ), + } +} + +/** @stable */ +export function summarizeAgentTaskRun< + TState, + TAction, + TActionResult, + TEval extends ControlEvalResult, +>(result: AgentTaskRunResult): AgentTaskRunSummary { + return { + taskId: result.task.id, + domain: result.task.domain, + status: result.status, + reason: result.control.reason, + readinessStatus: decideKnowledgeReadiness(result.knowledge).status, + readinessScore: result.knowledge.readinessScore, + recommendedAction: result.knowledge.recommendedAction, + blockingGapIds: result.knowledge.blockingMissingRequirements.map( + (requirement) => requirement.id, + ), + nonBlockingGapIds: result.knowledge.nonBlockingGaps.map((requirement) => requirement.id), + questionCount: result.questions.length, + acquisitionPlanCount: result.acquisitionPlans.length, + acquiredEvidenceCount: result.acquiredEvidenceIds.length, + controlStepCount: result.control.steps.length, + pass: result.control.pass, + failureClass: result.control.failureClass, + wallMs: result.control.wallMs, + costUsd: result.control.spentCostUsd, + } +} + +/** @stable */ +export async function* runAgentTaskStream( + options: RunAgentTaskStreamOptions, +): AsyncIterable { + const task = options.task + const 
input = { task, ...(options.input ?? {}) } as TInput + yield streamEvent({ type: 'task_start', task }) + + yield streamEvent({ type: 'readiness_start', task }) + let knowledge = await buildReadiness(task, options.knowledge) + const questions = userQuestionsForKnowledgeGaps(knowledge.blockingMissingRequirements) + const acquisitionPlans = acquisitionPlansForKnowledgeGaps([ + ...knowledge.blockingMissingRequirements, + ...knowledge.nonBlockingGaps, + ]) + const preflight = await runKnowledgePreflightStream( + task, + questions, + acquisitionPlans, + options.knowledge, + ) + for (const event of preflight.events) yield event + if ( + options.knowledge?.refreshReadiness && + (Object.keys(preflight.userAnswers).length > 0 || preflight.acquiredEvidenceIds.length > 0) + ) { + yield streamEvent({ type: 'readiness_start', task }) + knowledge = await options.knowledge.refreshReadiness({ + task, + previous: knowledge, + userAnswers: preflight.userAnswers, + acquiredEvidenceIds: preflight.acquiredEvidenceIds, + }) + } + const decision = decideKnowledgeReadiness(knowledge, { + minimumScore: options.minimumReadinessScore, + }) + yield streamEvent({ type: 'readiness_end', task, knowledge, decision }) + if (!decision.passed && decision.status === 'blocked') { + const reason = `knowledge readiness blocked: ${decision.reason}` + yield streamEvent({ type: 'task_end', task, status: 'blocked', reason }) + yield streamEvent({ type: 'final', task, status: 'blocked', reason }) + return + } + + const store = options.sessionStore + const existing = options.sessionId ? await store?.get(options.sessionId) : undefined + const shouldResume = Boolean(options.resume && existing) + let session = + shouldResume && existing + ? 
await resumeBackendSession(options.backend, existing, input, { + task, + knowledge, + signal: options.signal, + }) + : await startBackendSession( + options.backend, + input, + { task, knowledge, signal: options.signal }, + options.sessionId, + ) + await store?.put(session) + const sessionEvent = streamEvent({ + type: shouldResume ? 'session_resumed' : 'session_created', + task, + session, + }) + await store?.appendEvent?.(session.id, sessionEvent) + yield sessionEvent + + const backendStart = streamEvent({ + type: 'backend_start', + task, + session, + backend: options.backend.kind, + }) + await store?.appendEvent?.(session.id, backendStart) + yield backendStart + + let finalText = '' + try { + for await (const rawEvent of options.backend.stream(input, { + task, + knowledge, + session, + signal: options.signal, + })) { + const event = normalizeBackendStreamEvent(rawEvent, task, session) + if (event.type === 'text_delta') finalText += event.text + await store?.appendEvent?.(session.id, event) + yield event + } + const completedStatus: AgentTaskStatus = 'completed' + session = touchSession({ ...session, status: completedStatus }) + await store?.put(session) + const backendEnd = streamEvent({ + type: 'backend_end', + task, + session, + backend: options.backend.kind, + }) + await store?.appendEvent?.(session.id, backendEnd) + yield backendEnd + const reason = 'backend completed' + const taskEnd = streamEvent({ type: 'task_end', task, status: completedStatus, reason }) + await store?.appendEvent?.(session.id, taskEnd) + yield taskEnd + const final = streamEvent({ + type: 'final', + task, + session, + status: completedStatus, + reason, + text: finalText || undefined, + }) + await store?.appendEvent?.(session.id, final) + yield final + } catch (err) { + const message = err instanceof Error ? err.message : String(err) + session = touchSession({ ...session, status: options.signal?.aborted ? 
'aborted' : 'failed' }) + await store?.put(session) + let stopErrorMessage: string | undefined + try { + await options.backend.stop?.(session, message) + } catch (stopErr) { + stopErrorMessage = stopErr instanceof Error ? stopErr.message : String(stopErr) + } + const backendError = streamEvent({ + type: 'backend_error', + task, + session, + backend: options.backend.kind, + message: stopErrorMessage ? `${message}; backend stop failed: ${stopErrorMessage}` : message, + recoverable: !options.signal?.aborted, + }) + await store?.appendEvent?.(session.id, backendError) + yield backendError + const status: AgentTaskStatus = options.signal?.aborted ? 'aborted' : 'failed' + const taskEnd = streamEvent({ type: 'task_end', task, status, reason: message }) + await store?.appendEvent?.(session.id, taskEnd) + yield taskEnd + const final = streamEvent({ + type: 'final', + task, + session, + status, + reason: message, + text: finalText || undefined, + }) + await store?.appendEvent?.(session.id, final) + yield final + } +} + +async function runKnowledgePreflight< + TState, + TAction, + TActionResult, + TEval extends ControlEvalResult, +>( + task: AgentTaskSpec, + questions: UserQuestion[], + acquisitionPlans: DataAcquisitionPlan[], + provider: AgentKnowledgeProvider | undefined, + onEvent: AgentRuntimeEventSink | undefined, +): Promise<{ userAnswers: Record; acquiredEvidenceIds: string[] }> { + let userAnswers: Record = {} + let acquiredEvidenceIds: string[] = [] + if (questions.length > 0 && provider?.answerQuestions) { + await emit(onEvent, { type: 'questions_start', task, questions }) + userAnswers = await provider.answerQuestions(questions, task) + await emit(onEvent, { type: 'questions_end', task, questions, userAnswers }) + } + if (acquisitionPlans.length > 0 && provider?.executeAcquisitionPlans) { + await emit(onEvent, { type: 'acquisition_start', task, acquisitionPlans }) + acquiredEvidenceIds = await provider.executeAcquisitionPlans(acquisitionPlans, task) + await 
emit(onEvent, { + type: 'acquisition_end', + task, + acquisitionPlans, + acquiredEvidenceIds, + }) + } + return { userAnswers, acquiredEvidenceIds } +} + +async function runKnowledgePreflightStream( + task: AgentTaskSpec, + questions: UserQuestion[], + acquisitionPlans: DataAcquisitionPlan[], + provider: AgentKnowledgeProvider | undefined, +): Promise<{ + userAnswers: Record + acquiredEvidenceIds: string[] + events: RuntimeStreamEvent[] +}> { + const events: RuntimeStreamEvent[] = [] + let userAnswers: Record = {} + let acquiredEvidenceIds: string[] = [] + if (questions.length > 0 && provider?.answerQuestions) { + events.push(streamEvent({ type: 'questions_start', task, questions })) + userAnswers = await provider.answerQuestions(questions, task) + events.push(streamEvent({ type: 'questions_end', task, questions, userAnswers })) + } + if (acquisitionPlans.length > 0 && provider?.executeAcquisitionPlans) { + events.push(streamEvent({ type: 'acquisition_start', task, acquisitionPlans })) + acquiredEvidenceIds = await provider.executeAcquisitionPlans(acquisitionPlans, task) + events.push( + streamEvent({ type: 'acquisition_end', task, acquisitionPlans, acquiredEvidenceIds }), + ) + } + return { userAnswers, acquiredEvidenceIds, events } +} + +function streamEvent>( + event: T, +): T & { timestamp: string } { + return { ...event, timestamp: nowIso() } +} + +async function startBackendSession( + backend: AgentExecutionBackend, + input: TInput, + context: { task: AgentTaskSpec; knowledge: KnowledgeReadinessReport; signal?: AbortSignal }, + requestedSessionId?: string, +): Promise { + if (backend.start) return backend.start(input, { ...context, requestedSessionId }) + return newRuntimeSession(backend.kind, requestedSessionId) +} + +async function resumeBackendSession( + backend: AgentExecutionBackend, + session: RuntimeSession, + input: TInput, + context: { task: AgentTaskSpec; knowledge: KnowledgeReadinessReport; signal?: AbortSignal }, +): Promise { + if 
(session.backend !== backend.kind) { + throw new SessionMismatchError(session.backend, backend.kind) + } + if (backend.resume) return backend.resume(session, input, context) + return touchSession({ ...session, status: 'active' }) +} + +function buildReadiness( + task: AgentTaskSpec, + provider: AgentKnowledgeProvider | undefined, +): Promise | KnowledgeReadinessReport { + if (provider?.buildReadiness) return provider.buildReadiness(task) + return scoreKnowledgeReadiness({ + taskId: task.id, + requirements: task.requiredKnowledge ?? [], + metadata: { domain: task.domain, ...task.metadata }, + }) +} + +function isKnowledgeBlocked(evals: ControlEvalResult[]): boolean { + return evals.some((evalResult) => evalResult.id === 'knowledge-ready' && !evalResult.passed) +} + +function statusFromControl( + control: ControlRunResult, +): AgentTaskStatus { + if (control.stoppedBy === 'abort') return 'aborted' + if (control.reason.includes('knowledge readiness blocked')) return 'blocked' + if (control.pass) return 'completed' + return 'failed' +} + +async function emit( + sink: AgentRuntimeEventSink | undefined, + event: Parameters>[0], +): Promise { + await sink?.(event) +} + +function toAgentContext( + task: AgentTaskSpec, + knowledge: KnowledgeReadinessReport, + ctx: ControlContext, +): AgentTaskContext { + return { + task, + knowledge, + state: ctx.state, + evals: ctx.evals, + history: ctx.history, + budget: ctx.budget, + stepIndex: ctx.stepIndex, + wallMs: ctx.wallMs, + spentCostUsd: ctx.spentCostUsd, + remainingCostUsd: ctx.remainingCostUsd, + abortSignal: ctx.abortSignal, + } +} diff --git a/src/runtime-run.ts b/src/runtime-run.ts new file mode 100644 index 0000000..fe904ea --- /dev/null +++ b/src/runtime-run.ts @@ -0,0 +1,286 @@ +/** + * @stable + * + * Canonical production-run lifecycle. ONE abstraction for "the agent did a + * thing on behalf of a customer; record what it did, what it cost, and how it + * ended." 
Consumer agents (legal, tax, gtm, creative, agent-builder) reach for + * `startRuntimeRun` instead of inventing their own `agentRuns`-row helpers. + * + * Three concerns live in this module: + * + * 1. **Lifecycle state machine** — `running` -> `completed | failed | cancelled`, + * enforced by `RuntimeRunStateError`. Completion is idempotent (a second + * `complete()` call with the same status is a no-op so retries / cleanup + * paths don't double-fire side effects). A different terminal status is a + * state error. + * + * 2. **Cost ledger** — every `llm_call` event the handle observes contributes + * `tokensIn`, `tokensOut`, `costUsd`, and bumps `llmCalls`. Wall time is + * measured from `startRuntimeRun()` to `complete()`. Surface via + * `handle.cost()` for "cost per customer task" dashboards. + * + * 3. **Persistence adapter** — `RuntimeRunPersistenceAdapter` is the seam + * consumers plug in to write a `RuntimeRunRow` to their D1 / postgres / + * KV store. The adapter receives a sanitized row shape; no telemetry + * payload bytes flow through it unless the consumer attaches them itself + * via the `metadata` passed to `complete()` / `persist()`. + * + * The pattern replaces legal-agent's bespoke `completeProductionAgentRun` / + * `persistRuntimeRun` pair from `eval-evidence.ts` + `api.chat.ts`. Both are + * marked `@deprecated` in this release; consumers ditch them on their own + * version bumps. + */ + +import { RuntimeRunStateError, ValidationError } from './errors' +import type { AgentTaskSpec, RuntimeStreamEvent } from './types' + +/** @stable */ +export type RuntimeRunStatus = 'running' | 'completed' | 'failed' | 'cancelled' + +/** @stable */ +export interface RuntimeRunCost { + /** Cumulative input tokens across every observed `llm_call` event. */ + tokensIn: number + /** Cumulative output tokens across every observed `llm_call` event. */ + tokensOut: number + /** Sum of `costUsd` from every observed `llm_call` event. 
*/ + costUsd: number + /** Wall time from `startRuntimeRun()` to `complete()` (or `now()` if not yet completed). */ + wallMs: number + /** Count of `llm_call` events observed during the run. */ + llmCalls: number +} + +/** @stable */ +export interface RuntimeRunCompleteInput { + status: Exclude + resultSummary?: string + /** Optional explicit cost override, applied per field; any field omitted keeps its accumulated ledger value. `wallMs` is always measured from the clock and cannot be overridden. */ + cost?: Partial + /** Stable error message when `status === 'failed'`. */ + error?: string + /** Additional adapter-specific fields merged into the persisted row. */ + metadata?: Record +} + +/** @stable */ +export interface RuntimeRunRow { + /** Stable runtime-side identifier. Adapters may translate to their own primary key. */ + id: string + workspaceId: string + sessionId?: string + agentId?: string + domain?: string + taskId: string + scenarioId?: string + status: RuntimeRunStatus + resultSummary?: string + error?: string + cost: RuntimeRunCost + startedAt: string + completedAt?: string + metadata?: Record +} + +/** @stable */ +export interface RuntimeRunPersistenceAdapter { + /** + * Called every time `handle.persist()` runs, including retries, so + * implementations should be safe to call again with the same row. + * Implementations write `row` to their durable store (D1, postgres, KV); + * `persist()` awaits the call and discards any resolved value. Errors thrown + * here propagate out of `persist()` so the caller can decide whether to + * retry or log-and-continue. + */ + upsert(row: RuntimeRunRow): Promise | void +} + +/** @stable */ +export interface RuntimeRunOptions { + workspaceId: string + sessionId?: string + agentId?: string + taskSpec: AgentTaskSpec + scenarioId?: string + /** Optional persistence adapter; if omitted, `persist()` is a no-op. */ + adapter?: RuntimeRunPersistenceAdapter + /** Override the row id; default = `${taskSpec.id}:${random suffix}`. */ + id?: string + /** Override the clock; default = `Date.now()`. Useful for deterministic tests. 
*/ + now?: () => number +} + +/** @stable */ +export interface RuntimeRunHandle { + /** Stable id assigned at start. */ + readonly id: string + readonly workspaceId: string + readonly sessionId: string | undefined + readonly taskSpec: AgentTaskSpec + readonly status: RuntimeRunStatus + + /** + * Observe a single `RuntimeStreamEvent`. The handle ignores non-cost events + * (text deltas, tool calls) silently so consumers can pipe the whole stream + * through `handle.observe`. `llm_call` events update the ledger. + */ + observe(event: RuntimeStreamEvent): void + + /** Snapshot of the current cost ledger. Safe to call at any time. */ + cost(): RuntimeRunCost + + /** + * Transition to a terminal state. Idempotent for the same status; throws + * `RuntimeRunStateError` for a different terminal status (state machines + * don't time-travel). + */ + complete(input: RuntimeRunCompleteInput): void + + /** Build the current row without writing it. Useful for tests + dry runs. */ + toRow(metadata?: Record): RuntimeRunRow + + /** + * Persist the current row via the configured adapter. Must be called after + * `complete()`. Idempotent for the same terminal state (the adapter sees + * the same row on retry). + */ + persist(metadata?: Record): Promise +} + +/** + * @stable + * + * Construct a runtime-run handle. The returned handle is mutable across its + * lifetime; consumers should not share it across requests. + */ +export function startRuntimeRun(options: RuntimeRunOptions): RuntimeRunHandle { + if (!options.workspaceId) { + throw new ValidationError('startRuntimeRun: workspaceId is required') + } + if (!options.taskSpec?.id) { + throw new ValidationError('startRuntimeRun: taskSpec.id is required') + } + const now = options.now ?? Date.now + const startedAtMs = now() + const startedAt = new Date(startedAtMs).toISOString() + const id = options.id ?? 
`${options.taskSpec.id}:${randomSuffix()}` + + let status: RuntimeRunStatus = 'running' + let completedAtMs: number | undefined + let resultSummary: string | undefined + let error: string | undefined + let completionMetadata: Record | undefined + + const ledger: RuntimeRunCost = { + tokensIn: 0, + tokensOut: 0, + costUsd: 0, + wallMs: 0, + llmCalls: 0, + } + + const snapshotCost = (): RuntimeRunCost => ({ + tokensIn: ledger.tokensIn, + tokensOut: ledger.tokensOut, + costUsd: ledger.costUsd, + wallMs: (completedAtMs ?? now()) - startedAtMs, + llmCalls: ledger.llmCalls, + }) + + const buildRow = (extraMetadata?: Record): RuntimeRunRow => ({ + id, + workspaceId: options.workspaceId, + sessionId: options.sessionId, + agentId: options.agentId, + domain: options.taskSpec.domain, + taskId: options.taskSpec.id, + scenarioId: options.scenarioId, + status, + resultSummary, + error, + cost: snapshotCost(), + startedAt, + completedAt: completedAtMs !== undefined ? new Date(completedAtMs).toISOString() : undefined, + metadata: mergeMetadata(completionMetadata, extraMetadata), + }) + + return { + id, + workspaceId: options.workspaceId, + sessionId: options.sessionId, + taskSpec: options.taskSpec, + get status() { + return status + }, + observe(event) { + if (event.type !== 'llm_call') return + ledger.llmCalls += 1 + if (typeof event.tokensIn === 'number' && Number.isFinite(event.tokensIn)) { + ledger.tokensIn += event.tokensIn + } + if (typeof event.tokensOut === 'number' && Number.isFinite(event.tokensOut)) { + ledger.tokensOut += event.tokensOut + } + if (typeof event.costUsd === 'number' && Number.isFinite(event.costUsd)) { + ledger.costUsd += event.costUsd + } + }, + cost: snapshotCost, + complete(input) { + // `input.status` is typed `Exclude`, but + // a JS caller can still pass `'running'`. Validate defensively so the + // state machine is enforced at runtime, not just at compile time. 
+ if ((input.status as RuntimeRunStatus) === 'running') { + throw new ValidationError('complete() requires a terminal status, got "running"') + } + if (status !== 'running') { + if (status === input.status) return + throw new RuntimeRunStateError( + `Cannot transition runtime run from "${status}" to "${input.status}"`, + ) + } + status = input.status + completedAtMs = now() + resultSummary = input.resultSummary + error = input.error + completionMetadata = input.metadata + if (input.cost) { + if (typeof input.cost.tokensIn === 'number' && Number.isFinite(input.cost.tokensIn)) { + ledger.tokensIn = input.cost.tokensIn + } + if (typeof input.cost.tokensOut === 'number' && Number.isFinite(input.cost.tokensOut)) { + ledger.tokensOut = input.cost.tokensOut + } + if (typeof input.cost.costUsd === 'number' && Number.isFinite(input.cost.costUsd)) { + ledger.costUsd = input.cost.costUsd + } + if (typeof input.cost.llmCalls === 'number' && Number.isFinite(input.cost.llmCalls)) { + ledger.llmCalls = input.cost.llmCalls + } + } + }, + toRow(metadata) { + return buildRow(metadata) + }, + async persist(metadata) { + if (status === 'running') { + throw new RuntimeRunStateError('Cannot persist a runtime run before complete() is called') + } + if (!options.adapter) return + await options.adapter.upsert(buildRow(metadata)) + }, + } +} + +function mergeMetadata( + base: Record | undefined, + extra: Record | undefined, +): Record | undefined { + if (!base && !extra) return undefined + return { ...(base ?? {}), ...(extra ?? {}) } +} + +function randomSuffix(): string { + // Short, collision-resistant-enough for an in-memory id. Adapters that + // require stronger guarantees pass `options.id` explicitly. + return Math.random().toString(36).slice(2, 10) +} diff --git a/src/sanitize.ts b/src/sanitize.ts new file mode 100644 index 0000000..6b51031 --- /dev/null +++ b/src/sanitize.ts @@ -0,0 +1,554 @@ +/** + * @stable + * + * Sanitization for runtime telemetry. 
The rule: nothing user-controlled leaks + * unless the caller opts in with a `RuntimeTelemetryOptions` flag. This is the + * envelope that ends up in `agent_run.metadata.runtimeEvents` on every + * consumer, so the default must be safe. + */ + +import type { + ControlEvalResult, + ControlRunResult, + ControlStep, + DataAcquisitionPlan, + KnowledgeReadinessReport, + KnowledgeRequirement, + UserQuestion, +} from '@tangle-network/agent-eval' + +import type { + AgentRuntimeEvent, + AgentTaskSpec, + AgentTaskStatus, + RuntimeSession, + RuntimeStreamEvent, +} from './types' + +/** @stable */ +export interface RuntimeTelemetryOptions { + /** + * Include raw task inputs. Off by default because task inputs often contain + * customer facts, credentials, source text, or internal IDs. + */ + includeInputs?: boolean + /** Include requirement descriptions. Secret requirements are always redacted. */ + includeRequirementDescriptions?: boolean + /** Include evidence IDs. Off by default; counts are safer for shared reports. */ + includeEvidenceIds?: boolean + /** Include user answers from question preflight. Off by default. */ + includeUserAnswers?: boolean + /** Include action payloads and action results for control steps. Off by default. */ + includeControlPayloads?: boolean + /** Include task metadata. Off by default because metadata may carry IDs or policy internals. */ + includeMetadata?: boolean + /** Include eval detail/evidence strings. Off by default because validators may echo private input. 
*/ + includeEvalDetails?: boolean +} + +/** @stable */ +export interface SanitizedKnowledgeRequirement { + id: string + description?: string + requiredFor: string[] + category: KnowledgeRequirement['category'] + acquisitionMode: KnowledgeRequirement['acquisitionMode'] + importance: KnowledgeRequirement['importance'] + freshness: KnowledgeRequirement['freshness'] + sensitivity: KnowledgeRequirement['sensitivity'] + confidenceNeeded: number + currentConfidence: number + evidenceCount: number + evidenceIds?: string[] + fallbackPolicy: KnowledgeRequirement['fallbackPolicy'] +} + +/** @stable */ +export interface SanitizedKnowledgeReadinessReport { + taskId: string + readinessScore: number + recommendedAction: KnowledgeReadinessReport['recommendedAction'] + severity: KnowledgeReadinessReport['severity'] + reason: string + blockingMissingRequirements: SanitizedKnowledgeRequirement[] + nonBlockingGaps: SanitizedKnowledgeRequirement[] + evidenceCount: number + evidenceIds?: string[] + missingRequirementIds: string[] +} + +/** @stable */ +export function sanitizeKnowledgeReadinessReport( + report: KnowledgeReadinessReport, + options: RuntimeTelemetryOptions = {}, +): SanitizedKnowledgeReadinessReport { + return { + taskId: report.taskId, + readinessScore: report.readinessScore, + recommendedAction: report.recommendedAction, + severity: report.severity, + reason: report.reason, + blockingMissingRequirements: report.blockingMissingRequirements.map((requirement) => + sanitizeKnowledgeRequirement(requirement, options), + ), + nonBlockingGaps: report.nonBlockingGaps.map((requirement) => + sanitizeKnowledgeRequirement(requirement, options), + ), + evidenceCount: report.bundle.evidenceIds.length, + evidenceIds: options.includeEvidenceIds ? 
/**
 * @stable
 *
 * Projects a lifecycle `AgentRuntimeEvent` onto a plain record that is safe to
 * log. Every result carries the event `type` and a sanitized `task`; the
 * remaining fields depend on the event type. User answers and evidence ids are
 * only passed through when the matching `RuntimeTelemetryOptions` flag is set.
 *
 * @param event - Runtime event to sanitize.
 * @param options - Redaction flags; defaults to redacting everything optional.
 * @returns A record containing only the sanitized view of `event`.
 */
export function sanitizeAgentRuntimeEvent<
  TState,
  TAction,
  TActionResult,
  TEval extends ControlEvalResult,
>(
  event: AgentRuntimeEvent<TState, TAction, TActionResult, TEval>,
  options: RuntimeTelemetryOptions = {},
): Record<string, unknown> {
  // Shared envelope: built once, before any type-specific work.
  const envelope = { type: event.type, task: sanitizeTask(event.task, options) }

  switch (event.type) {
    case 'task_start':
    case 'readiness_start':
      // These events carry nothing beyond the task itself.
      return envelope

    case 'control_start':
    case 'readiness_end': {
      const knowledge = sanitizeKnowledgeReadinessReport(event.knowledge, options)
      return { ...envelope, knowledge }
    }

    case 'questions_start': {
      const questions = event.questions.map((entry) => sanitizeQuestion(entry, options))
      return { ...envelope, questions }
    }

    case 'questions_end': {
      const questions = event.questions.map((entry) => sanitizeQuestion(entry, options))
      // Answer keys are kept either way; values are masked unless opted in.
      const userAnswers = options.includeUserAnswers
        ? event.userAnswers
        : redactRecord(event.userAnswers)
      return { ...envelope, questions, userAnswers }
    }

    case 'acquisition_start': {
      const acquisitionPlans = event.acquisitionPlans.map((plan) => sanitizeAcquisitionPlan(plan))
      return { ...envelope, acquisitionPlans }
    }

    case 'acquisition_end': {
      const acquisitionPlans = event.acquisitionPlans.map((plan) => sanitizeAcquisitionPlan(plan))
      const acquired = event.acquiredEvidenceIds
      return {
        ...envelope,
        acquisitionPlans,
        acquiredEvidenceCount: acquired.length,
        acquiredEvidenceIds: options.includeEvidenceIds ? acquired : undefined,
      }
    }

    case 'control_step':
      return { ...envelope, step: sanitizeControlStep(event.step, options) }

    case 'control_end':
      return { ...envelope, control: sanitizeControlRun(event.control, options) }

    default:
      // Only `task_end` remains in the union at this point.
      return { ...envelope, status: event.status, reason: event.reason }
  }
}
event.acquiredEvidenceIds : undefined, + } + } + if (event.type === 'tool_call') { + return { + type: event.type, + ...withTask, + ...withSession, + timestamp: event.timestamp, + toolName: event.toolName, + toolCallId: event.toolCallId, + args: options.includeControlPayloads ? event.args : undefined, + } + } + if (event.type === 'tool_result') { + return { + type: event.type, + ...withTask, + ...withSession, + timestamp: event.timestamp, + toolName: event.toolName, + toolCallId: event.toolCallId, + result: options.includeControlPayloads ? event.result : undefined, + } + } + if (event.type === 'llm_call') { + return { + type: event.type, + ...withTask, + ...withSession, + timestamp: event.timestamp, + model: event.model, + tokensIn: event.tokensIn, + tokensOut: event.tokensOut, + costUsd: event.costUsd, + latencyMs: event.latencyMs, + finishReason: event.finishReason, + } + } + if (event.type === 'artifact') { + return { + type: event.type, + ...withTask, + ...withSession, + timestamp: event.timestamp, + artifactId: event.artifactId, + name: event.name, + mimeType: event.mimeType, + uri: options.includeEvidenceIds ? event.uri : undefined, + metadata: options.includeMetadata ? event.metadata : undefined, + } + } + if (event.type === 'final') { + return { + type: event.type, + ...withTask, + ...withSession, + timestamp: event.timestamp, + status: event.status, + reason: event.reason, + text: options.includeControlPayloads ? event.text : undefined, + metadata: options.includeMetadata ? event.metadata : undefined, + } + } + return { + type: event.type, + ...withTask, + ...withSession, + timestamp: 'timestamp' in event ? event.timestamp : undefined, + ...pickPublicStreamFields(event), + } +} + +function sanitizeTask( + task: AgentTaskSpec, + options: RuntimeTelemetryOptions, +): Record { + return { + id: task.id, + intent: task.intent, + domain: task.domain, + inputs: options.includeInputs ? task.inputs : task.inputs ? 
/**
 * Reduces a `KnowledgeRequirement` to its telemetry-safe projection.
 *
 * The description is emitted only when `includeRequirementDescriptions` is set
 * and the requirement is not marked `secret`. Evidence ids are emitted only
 * when `includeEvidenceIds` is set; the evidence count is always reported.
 */
function sanitizeKnowledgeRequirement(
  requirement: KnowledgeRequirement,
  options: RuntimeTelemetryOptions,
): SanitizedKnowledgeRequirement {
  const { evidenceIds } = requirement

  // Secret requirements never expose their description, whatever the flag says.
  let description: SanitizedKnowledgeRequirement['description']
  if (options.includeRequirementDescriptions && requirement.sensitivity !== 'secret') {
    description = requirement.description
  }

  const sanitized: SanitizedKnowledgeRequirement = {
    id: requirement.id,
    description,
    requiredFor: requirement.requiredFor,
    category: requirement.category,
    acquisitionMode: requirement.acquisitionMode,
    importance: requirement.importance,
    freshness: requirement.freshness,
    sensitivity: requirement.sensitivity,
    confidenceNeeded: requirement.confidenceNeeded,
    currentConfidence: requirement.currentConfidence,
    evidenceCount: evidenceIds.length,
    evidenceIds: options.includeEvidenceIds ? evidenceIds : undefined,
    fallbackPolicy: requirement.fallbackPolicy,
  }
  return sanitized
}
/**
 * Summarizes a `DataAcquisitionPlan` for telemetry: identifying fields are
 * copied through, while expected evidence and questions are reported as
 * counts only (0 when the list is absent).
 */
function sanitizeAcquisitionPlan(plan: DataAcquisitionPlan): Record<string, unknown> {
  const { id, requirementIds, mode, priority, expectedEvidenceIds, questions } = plan
  // Missing lists count as empty.
  const sizeOf = (items: { length: number } | null | undefined): number => items?.length ?? 0

  return {
    id,
    requirementIds,
    mode,
    priority,
    expectedEvidenceCount: sizeOf(expectedEvidenceIds),
    questionCount: sizeOf(questions),
  }
}
actionOutcome.error : undefined, + durationMs: actionOutcome?.durationMs, + evalsBefore: summarizeEvals(step.evalsBefore, options), + evalsAfter: summarizeEvals(step.evalsAfter, options), + startedAt: step.startedAt, + endedAt: step.endedAt, + } +} + +function sanitizeControlRun( + control: ControlRunResult, + options: RuntimeTelemetryOptions, +): Record { + return { + pass: control.pass, + completed: control.completed, + reason: control.reason, + score: control.score, + stepCount: control.steps.length, + wallMs: control.wallMs, + spentCostUsd: control.spentCostUsd, + failureClass: control.failureClass, + stoppedBy: control.stoppedBy, + runId: control.runId, + runtimeErrorCount: control.runtimeErrors.length, + finalEvals: summarizeEvals(control.finalEvals, options), + } +} + +function summarizeEvals( + evals: ControlEvalResult[], + options: RuntimeTelemetryOptions, +): Array> { + return evals.map((evalResult) => ({ + id: evalResult.id, + passed: evalResult.passed, + score: evalResult.score, + severity: evalResult.severity, + objective: evalResult.objective, + detail: options.includeEvalDetails ? evalResult.detail : undefined, + evidence: options.includeEvalDetails ? 
/**
 * Returns a copy of `record` that keeps every own enumerable string key but
 * replaces each value with the literal `'[redacted]'`.
 */
function redactRecord(record: Record<string, unknown>): Record<string, string> {
  const masked: Array<[string, string]> = []
  for (const key of Object.keys(record)) {
    masked.push([key, '[redacted]'])
  }
  // Object.fromEntries defines own properties, so unusual keys survive intact.
  return Object.fromEntries(masked)
}
/**
 * @stable
 *
 * Collector for `RuntimeStreamEvent`s, the streaming sibling of
 * `createRuntimeEventCollector`. Feed each event to `onEvent`; the sanitized
 * copy is appended to `events` using the supplied `RuntimeTelemetryOptions`.
 *
 * While collecting, a small running summary is maintained from the raw
 * (unsanitized) events: per-type counts, the first session id seen on a
 * `session_created` / `session_resumed` event, the status and reason of the
 * most recent `final` event, and the concatenation of all `text_delta` text.
 * `summary()` returns a snapshot of that state.
 *
 * It is a separate factory rather than an overload because stream and
 * non-stream events share some `type` literals while differing in shape.
 */
export function createRuntimeStreamEventCollector(
  options: RuntimeTelemetryOptions = {},
): RuntimeStreamEventCollector {
  const sanitized: Array<Record<string, unknown>> = []
  const tally: Record<string, number> = {}
  const seen: {
    sessionId: string | undefined
    status: AgentTaskStatus | undefined
    reason: string | undefined
    text: string
  } = { sessionId: undefined, status: undefined, reason: undefined, text: '' }

  const record: RuntimeStreamEventSink = (event) => {
    // Sanitize first so a failure there leaves the summary state untouched.
    sanitized.push(sanitizeRuntimeStreamEvent(event, options))
    tally[event.type] = (tally[event.type] ?? 0) + 1

    switch (event.type) {
      case 'text_delta':
        seen.text += event.text
        break
      case 'session_created':
      case 'session_resumed':
        // Keep only the first session id observed.
        if (!seen.sessionId) seen.sessionId = event.session.id
        break
      case 'final':
        seen.status = event.status
        seen.reason = event.reason
        break
      default:
        break
    }
  }

  return {
    events: sanitized,
    onEvent: record,
    summary: () => ({
      eventCount: sanitized.length,
      // Copy so callers cannot mutate the live counters.
      eventCountsByType: { ...tally },
      firstSessionId: seen.sessionId,
      finalStatus: seen.status,
      finalReason: seen.reason,
      finalText: seen.text,
    }),
  }
}
/** @stable */
export interface ServerSentEventOptions {
  /** Value for the `event:` field; the field is omitted when this is empty. */
  event?: string
  /** Value for the `id:` field; the field is omitted when this is empty. */
  id?: string
  /** Value for the `retry:` field; ignored unless it is a finite number >= 0. */
  retry?: number
}

/**
 * @stable
 *
 * Serializes one Server-Sent Events frame.
 *
 * Field order is `id`, `event`, `retry`, then one `data:` line per line of the
 * payload, terminated by a blank line. String payloads are sent verbatim;
 * anything else is passed through `JSON.stringify`.
 *
 * @param data - Payload. Values that `JSON.stringify` cannot represent
 *   (`undefined`, functions, symbols) are encoded as the JSON literal `null`
 *   instead of throwing.
 * @param options - Optional `id` / `event` / `retry` fields.
 * @returns The encoded frame, always ending in `\n\n`.
 */
export function encodeServerSentEvent(data: unknown, options: ServerSentEventOptions = {}): string {
  const lines: string[] = []
  if (options.id) lines.push(`id: ${stripNewlines(options.id)}`)
  if (options.event) lines.push(`event: ${stripNewlines(options.event)}`)
  if (typeof options.retry === 'number' && Number.isFinite(options.retry) && options.retry >= 0) {
    // The retry field must be an integer; fractional values are truncated down.
    lines.push(`retry: ${Math.floor(options.retry)}`)
  }

  // JSON.stringify returns `undefined` (not a string) for undefined, functions
  // and symbols; fall back to `null` so the split below cannot throw.
  const payload = typeof data === 'string' ? data : (JSON.stringify(data) ?? 'null')

  // SSE parsers end a line on CRLF, LF, *or* a lone CR. Splitting on all three
  // keeps a stray `\r` in a string payload from starting an attacker-chosen
  // field line on the client.
  for (const line of payload.split(/\r\n|\r|\n/)) {
    lines.push(`data: ${line}`)
  }
  return `${lines.join('\n')}\n\n`
}

/** Replaces every CR and LF character with a space so a field stays on one line. */
function stripNewlines(value: string): string {
  return value.replace(/[\r\n]/g, ' ')
}
/**
 * @stable
 *
 * Creates a bridge that converts runtime stream events into agent-eval
 * `TraceEvent`s stamped with `options.runId` and, when given, `options.spanId`.
 * The only state held is the counter behind the default `evt-N` id generator,
 * which is used when `options.newEventId` is not supplied.
 *
 * @throws ValidationError when `options.runId` is empty or missing.
 */
export function createTraceBridge(options: TraceBridgeOptions): TraceBridge {
  if (!options.runId) {
    throw new ValidationError('createTraceBridge: runId is required')
  }

  let issued = 0
  const defaultEventId = (): string => {
    issued += 1
    return `evt-${issued}`
  }
  const nextEventId = options.newEventId ?? defaultEventId
  const { spanId } = options

  function toTraceEvent(event: RuntimeStreamEvent): TraceEvent | undefined {
    const projected = projectToTraceEvent(event)
    // Events without a projection consume no event id.
    if (!projected) return undefined
    const { kind, payload } = projected
    return {
      eventId: nextEventId(),
      runId: options.runId,
      spanId,
      kind,
      timestamp: timestampFor(event),
      payload,
    }
  }

  function drain(events: Iterable<RuntimeStreamEvent>): TraceEvent[] {
    const collected: TraceEvent[] = []
    for (const streamEvent of events) {
      const traced = toTraceEvent(streamEvent)
      if (traced !== undefined) collected.push(traced)
    }
    return collected
  }

  return { toTraceEvent, drain }
}
'log' : 'policy_violation', + payload: { + phase: 'readiness_end', + taskId: event.task.id, + status: event.decision.status, + readinessScore: event.decision.readinessScore, + blockingGapIds: event.decision.blockingGapIds, + nonBlockingGapIds: event.decision.nonBlockingGapIds, + reason: event.decision.reason, + }, + } + case 'questions_start': + return { + kind: 'log', + payload: { phase: 'questions_start', questionCount: event.questions.length }, + } + case 'questions_end': + return { + kind: 'log', + payload: { + phase: 'questions_end', + questionCount: event.questions.length, + answerCount: Object.keys(event.userAnswers).length, + }, + } + case 'acquisition_start': + return { + kind: 'log', + payload: { phase: 'acquisition_start', planCount: event.acquisitionPlans.length }, + } + case 'acquisition_end': + return { + kind: 'log', + payload: { + phase: 'acquisition_end', + planCount: event.acquisitionPlans.length, + evidenceCount: event.acquiredEvidenceIds.length, + }, + } + case 'session_created': + case 'session_resumed': + return { + kind: 'log', + payload: { + phase: event.type, + sessionId: event.session.id, + backend: event.session.backend, + }, + } + case 'backend_start': + case 'backend_end': + return { kind: 'log', payload: { phase: event.type, backend: event.backend } } + case 'backend_error': + return { + kind: 'error', + payload: { + backend: event.backend, + message: event.message, + recoverable: event.recoverable, + }, + } + case 'tool_call': + return { + kind: 'log', + payload: { + phase: 'tool_call', + toolName: event.toolName, + toolCallId: event.toolCallId, + // Args intentionally omitted at this layer; consumers attach the + // payload to a `ToolSpan` if they need to retain it. Trace events + // are point-in-time markers, not the canonical store for tool I/O. 
/**
 * Resolves the epoch-millisecond timestamp for a stream event. Falls back to
 * the current time when the event has no timestamp or `Date.parse` cannot
 * read it.
 */
function timestampFor(event: RuntimeStreamEvent): number {
  if (!('timestamp' in event) || !event.timestamp) return Date.now()

  const epochMs = Date.parse(event.timestamp)
  if (Number.isFinite(epochMs)) return epochMs
  return Date.now()
}
+ * + * This module owns the public shape of every cross-cutting record (`TaskSpec`, + * `RuntimeSession`, `RuntimeStreamEvent`). Everything else in the runtime + * imports from here so type-level changes ripple in one place. + */ + +import type { + ControlBudget, + ControlDecision, + ControlEvalResult, + ControlRunResult, + ControlStep, + DataAcquisitionPlan, + KnowledgeReadinessReport, + KnowledgeRequirement, + RunRecord, + TraceStore, + UserQuestion, +} from '@tangle-network/agent-eval' + +/** @stable */ +export interface AgentTaskSpec { + id: string + intent: string + /** Domain is metadata, not an architectural boundary: tax, legal, gtm, creative, blueprint, redteam, etc. */ + domain?: string + inputs?: Record + requiredKnowledge?: KnowledgeRequirement[] + budget?: Partial + metadata?: Record +} + +/** @stable */ +export interface AgentKnowledgeProvider { + buildReadiness?(task: AgentTaskSpec): Promise | KnowledgeReadinessReport + answerQuestions?( + questions: UserQuestion[], + task: AgentTaskSpec, + ): Promise> | Record + executeAcquisitionPlans?( + plans: DataAcquisitionPlan[], + task: AgentTaskSpec, + ): Promise | string[] + refreshReadiness?(input: { + task: AgentTaskSpec + previous: KnowledgeReadinessReport + userAnswers: Record + acquiredEvidenceIds: string[] + }): Promise | KnowledgeReadinessReport +} + +/** @stable */ +export interface AgentTaskContext< + TState, + TAction, + TActionResult, + TEval extends ControlEvalResult = ControlEvalResult, +> { + task: AgentTaskSpec + knowledge: KnowledgeReadinessReport + state: TState + evals: TEval[] + history: ControlStep[] + budget: ControlBudget + stepIndex: number + wallMs: number + spentCostUsd: number + remainingCostUsd?: number + abortSignal: AbortSignal +} + +/** @stable */ +export interface AgentAdapter< + TState, + TAction, + TActionResult, + TEval extends ControlEvalResult = ControlEvalResult, +> { + observe(ctx: { + task: AgentTaskSpec + knowledge: KnowledgeReadinessReport + history: ControlStep[] + 
abortSignal: AbortSignal + }): Promise | TState + + validate(ctx: { + task: AgentTaskSpec + knowledge: KnowledgeReadinessReport + state: TState + history: ControlStep[] + abortSignal: AbortSignal + }): Promise | TEval[] + + decide( + ctx: AgentTaskContext, + ): Promise> | ControlDecision + + act( + action: TAction, + ctx: AgentTaskContext, + ): Promise | TActionResult + + shouldStop?(ctx: AgentTaskContext): + | Promise<{ + stop: boolean + pass: boolean + reason: string + score?: number + }> + | { + stop: boolean + pass: boolean + reason: string + score?: number + } + + onKnowledgeBlocked?(ctx: { + task: AgentTaskSpec + knowledge: KnowledgeReadinessReport + questions: UserQuestion[] + acquisitionPlans: DataAcquisitionPlan[] + }): Promise> | ControlDecision + + getActionCostUsd?(ctx: { + action: TAction + result: TActionResult + task: AgentTaskSpec + state: TState + evals: TEval[] + history: ControlStep[] + }): number | undefined + + projectRunRecords?( + result: ControlRunResult, + task: AgentTaskSpec, + ): RunRecord[] +} + +/** @stable */ +export type AgentTaskStatus = 'completed' | 'blocked' | 'failed' | 'aborted' + +/** @stable */ +export type AgentRuntimeEvent< + TState = unknown, + TAction = unknown, + TActionResult = unknown, + TEval extends ControlEvalResult = ControlEvalResult, +> = + | { type: 'task_start'; task: AgentTaskSpec } + | { type: 'readiness_start'; task: AgentTaskSpec } + | { type: 'readiness_end'; task: AgentTaskSpec; knowledge: KnowledgeReadinessReport } + | { type: 'questions_start'; task: AgentTaskSpec; questions: UserQuestion[] } + | { + type: 'questions_end' + task: AgentTaskSpec + questions: UserQuestion[] + userAnswers: Record + } + | { + type: 'acquisition_start' + task: AgentTaskSpec + acquisitionPlans: DataAcquisitionPlan[] + } + | { + type: 'acquisition_end' + task: AgentTaskSpec + acquisitionPlans: DataAcquisitionPlan[] + acquiredEvidenceIds: string[] + } + | { type: 'control_start'; task: AgentTaskSpec; knowledge: 
KnowledgeReadinessReport } + | { + type: 'control_step' + task: AgentTaskSpec + step: ControlStep + } + | { + type: 'control_end' + task: AgentTaskSpec + control: ControlRunResult + } + | { type: 'task_end'; task: AgentTaskSpec; status: AgentTaskStatus; reason: string } + +/** @stable */ +export type AgentRuntimeEventSink< + TState = unknown, + TAction = unknown, + TActionResult = unknown, + TEval extends ControlEvalResult = ControlEvalResult, +> = (event: AgentRuntimeEvent) => Promise | void + +/** @stable */ +export type RuntimeStreamEvent = + | { type: 'task_start'; task: AgentTaskSpec; timestamp: string } + | { type: 'readiness_start'; task: AgentTaskSpec; timestamp: string } + | { + type: 'readiness_end' + task: AgentTaskSpec + knowledge: KnowledgeReadinessReport + decision: KnowledgeReadinessDecision + timestamp: string + } + | { + type: 'questions_start' + task: AgentTaskSpec + questions: UserQuestion[] + timestamp: string + } + | { + type: 'questions_end' + task: AgentTaskSpec + questions: UserQuestion[] + userAnswers: Record + timestamp: string + } + | { + type: 'acquisition_start' + task: AgentTaskSpec + acquisitionPlans: DataAcquisitionPlan[] + timestamp: string + } + | { + type: 'acquisition_end' + task: AgentTaskSpec + acquisitionPlans: DataAcquisitionPlan[] + acquiredEvidenceIds: string[] + timestamp: string + } + | { type: 'session_created'; task: AgentTaskSpec; session: RuntimeSession; timestamp: string } + | { type: 'session_resumed'; task: AgentTaskSpec; session: RuntimeSession; timestamp: string } + | { + type: 'backend_start' + task: AgentTaskSpec + session: RuntimeSession + backend: string + timestamp: string + } + | { + type: 'text_delta' + task?: AgentTaskSpec + session?: RuntimeSession + text: string + timestamp?: string + } + | { + type: 'reasoning_delta' + task?: AgentTaskSpec + session?: RuntimeSession + text: string + timestamp?: string + } + | { + type: 'tool_call' + task?: AgentTaskSpec + session?: RuntimeSession + toolName: string + 
toolCallId?: string + args?: unknown + timestamp?: string + } + | { + type: 'tool_result' + task?: AgentTaskSpec + session?: RuntimeSession + toolName: string + toolCallId?: string + result?: unknown + timestamp?: string + } + | { + type: 'llm_call' + task?: AgentTaskSpec + session?: RuntimeSession + model: string + tokensIn?: number + tokensOut?: number + costUsd?: number + latencyMs?: number + finishReason?: string + timestamp?: string + } + | { + type: 'artifact' + task?: AgentTaskSpec + session?: RuntimeSession + artifactId: string + name?: string + mimeType?: string + uri?: string + metadata?: Record + timestamp?: string + } + | { + type: 'backend_error' + task: AgentTaskSpec + session?: RuntimeSession + backend: string + message: string + recoverable: boolean + timestamp: string + } + | { + type: 'backend_end' + task: AgentTaskSpec + session: RuntimeSession + backend: string + timestamp: string + } + | { + type: 'task_end' + task: AgentTaskSpec + status: AgentTaskStatus + reason: string + timestamp: string + } + | { + type: 'final' + task: AgentTaskSpec + session?: RuntimeSession + status: AgentTaskStatus + reason: string + text?: string + metadata?: Record + timestamp: string + } + +/** @stable */ +export interface RuntimeSession { + id: string + backend: string + status: 'active' | 'completed' | 'failed' | 'aborted' + resumeToken?: string + createdAt: string + updatedAt: string + metadata?: Record +} + +/** @stable */ +export interface RuntimeSessionStore { + get(sessionId: string): Promise | RuntimeSession | undefined + put(session: RuntimeSession): Promise | void + appendEvent?(sessionId: string, event: RuntimeStreamEvent): Promise | void + listEvents?(sessionId: string): Promise | RuntimeStreamEvent[] +} + +/** @stable */ +export interface AgentBackendInput { + task: AgentTaskSpec + message?: string + messages?: Array<{ role: string; content: string }> + inputs?: Record +} + +/** @stable */ +export interface AgentBackendContext { + task: AgentTaskSpec + 
knowledge: KnowledgeReadinessReport + session: RuntimeSession + signal?: AbortSignal +} + +/** @stable */ +export interface AgentExecutionBackend { + kind: string + start?( + input: TInput, + context: Omit & { requestedSessionId?: string }, + ): Promise | RuntimeSession + resume?( + session: RuntimeSession, + input: TInput, + context: Omit, + ): Promise | RuntimeSession + stream(input: TInput, context: AgentBackendContext): AsyncIterable + stop?(session: RuntimeSession, reason: string): Promise | void +} + +/** @stable */ +export interface RunAgentTaskStreamOptions { + task: AgentTaskSpec + backend: AgentExecutionBackend + input?: Omit + knowledge?: AgentKnowledgeProvider + sessionStore?: RuntimeSessionStore + sessionId?: string + resume?: boolean + signal?: AbortSignal + minimumReadinessScore?: number +} + +/** @stable */ +export interface RunAgentTaskOptions< + TState, + TAction, + TActionResult, + TEval extends ControlEvalResult = ControlEvalResult, +> { + task: AgentTaskSpec + adapter: AgentAdapter + knowledge?: AgentKnowledgeProvider + onEvent?: AgentRuntimeEventSink + store?: TraceStore + signal?: AbortSignal + scenarioId?: string + projectId?: string + variantId?: string + minimumReadinessScore?: number +} + +/** @stable */ +export interface AgentTaskRunResult< + TState, + TAction, + TActionResult, + TEval extends ControlEvalResult = ControlEvalResult, +> { + task: AgentTaskSpec + status: AgentTaskStatus + knowledge: KnowledgeReadinessReport + questions: UserQuestion[] + acquisitionPlans: DataAcquisitionPlan[] + userAnswers: Record + acquiredEvidenceIds: string[] + control: ControlRunResult + runRecords: RunRecord[] +} + +/** @stable */ +export interface AgentTaskRunSummary { + taskId: string + domain?: string + status: AgentTaskStatus + reason: string + readinessStatus: KnowledgeReadinessDecision['status'] + readinessScore: number + recommendedAction: KnowledgeReadinessReport['recommendedAction'] + blockingGapIds: string[] + nonBlockingGapIds: string[] + 
questionCount: number + acquisitionPlanCount: number + acquiredEvidenceCount: number + controlStepCount: number + pass: boolean + failureClass?: string + wallMs: number + costUsd: number +} + +/** @stable */ +export interface KnowledgeReadinessDecision { + passed: boolean + status: 'ready' | 'blocked' | 'caveat' + reason: string + readinessScore: number + recommendedAction: KnowledgeReadinessReport['recommendedAction'] + severity: KnowledgeReadinessReport['severity'] + blockingGapIds: string[] + nonBlockingGapIds: string[] +} diff --git a/tests/runtime-run.test.ts b/tests/runtime-run.test.ts new file mode 100644 index 0000000..abd1168 --- /dev/null +++ b/tests/runtime-run.test.ts @@ -0,0 +1,249 @@ +import { describe, expect, it, vi } from 'vitest' +import { + type AgentTaskSpec, + type RuntimeRunPersistenceAdapter, + type RuntimeRunRow, + RuntimeRunStateError, + type RuntimeStreamEvent, + startRuntimeRun, + ValidationError, +} from '../src/index' + +const task: AgentTaskSpec = { + id: 'task-runtime-run', + intent: 'Review the latest filing', + domain: 'legal', + metadata: { workspaceId: 'ws-1' }, +} + +function llmCall( + partial: Partial>, +): RuntimeStreamEvent { + return { + type: 'llm_call', + model: 'claude-sonnet-4-6', + timestamp: new Date(0).toISOString(), + ...partial, + } +} + +describe('startRuntimeRun', () => { + it('rejects missing workspaceId and missing taskSpec.id', () => { + expect(() => startRuntimeRun({ workspaceId: '', taskSpec: task })).toThrow(ValidationError) + expect(() => startRuntimeRun({ workspaceId: 'ws-1', taskSpec: { ...task, id: '' } })).toThrow( + ValidationError, + ) + }) + + it('initializes with a stable id and defaults to running status', () => { + const handle = startRuntimeRun({ + workspaceId: 'ws-1', + sessionId: 'thread-1', + taskSpec: task, + id: 'run-fixed', + now: () => 100, + }) + + expect(handle.id).toBe('run-fixed') + expect(handle.workspaceId).toBe('ws-1') + expect(handle.sessionId).toBe('thread-1') + 
expect(handle.status).toBe('running') + expect(handle.cost()).toEqual({ + tokensIn: 0, + tokensOut: 0, + costUsd: 0, + wallMs: 0, + llmCalls: 0, + }) + }) + + it('accumulates an llm cost ledger from llm_call events and exposes wall time', () => { + let clock = 1000 + const handle = startRuntimeRun({ + workspaceId: 'ws-1', + taskSpec: task, + id: 'cost-run', + now: () => clock, + }) + + clock = 1200 + handle.observe(llmCall({ tokensIn: 100, tokensOut: 50, costUsd: 0.002 })) + clock = 1500 + handle.observe(llmCall({ tokensIn: 80, tokensOut: 20, costUsd: 0.001 })) + clock = 1500 + handle.observe( + // Non-cost events must not mutate the ledger. + { + type: 'tool_call', + toolName: 'shell', + timestamp: new Date(0).toISOString(), + }, + ) + + const cost = handle.cost() + expect(cost.tokensIn).toBe(180) + expect(cost.tokensOut).toBe(70) + expect(cost.costUsd).toBeCloseTo(0.003, 9) + expect(cost.llmCalls).toBe(2) + expect(cost.wallMs).toBe(500) + }) + + it('ignores non-finite numbers on llm_call events without polluting the ledger', () => { + const handle = startRuntimeRun({ + workspaceId: 'ws-1', + taskSpec: task, + id: 'guard-run', + now: () => 0, + }) + handle.observe( + llmCall({ tokensIn: Number.NaN, tokensOut: Number.POSITIVE_INFINITY, costUsd: 0.5 }), + ) + const cost = handle.cost() + expect(cost.tokensIn).toBe(0) + expect(cost.tokensOut).toBe(0) + expect(cost.costUsd).toBe(0.5) + expect(cost.llmCalls).toBe(1) + }) + + it('completes idempotently with the same status and freezes wallMs at completion', () => { + let clock = 100 + const handle = startRuntimeRun({ + workspaceId: 'ws-1', + taskSpec: task, + id: 'idempotent-run', + now: () => clock, + }) + clock = 350 + handle.complete({ status: 'completed', resultSummary: 'ok' }) + expect(handle.status).toBe('completed') + const firstCost = handle.cost() + + clock = 9999 + // Same terminal status is a no-op (does not throw, does not update wallMs). 
+ handle.complete({ status: 'completed', resultSummary: 'ok-again' }) + expect(handle.status).toBe('completed') + const secondCost = handle.cost() + expect(firstCost.wallMs).toBe(250) + expect(secondCost.wallMs).toBe(250) + }) + + it('refuses to transition between two different terminal statuses', () => { + const handle = startRuntimeRun({ workspaceId: 'ws-1', taskSpec: task, id: 'no-time-travel' }) + handle.complete({ status: 'completed', resultSummary: 'done' }) + expect(() => handle.complete({ status: 'failed', error: 'too late' })).toThrow( + RuntimeRunStateError, + ) + }) + + it('persist() refuses to run before complete()', async () => { + const handle = startRuntimeRun({ workspaceId: 'ws-1', taskSpec: task, id: 'must-complete' }) + await expect(handle.persist()).rejects.toBeInstanceOf(RuntimeRunStateError) + }) + + it('persist() writes the canonical row to the adapter', async () => { + const rows: RuntimeRunRow[] = [] + const adapter: RuntimeRunPersistenceAdapter = { + upsert(row) { + rows.push(row) + }, + } + let clock = 0 + const handle = startRuntimeRun({ + workspaceId: 'ws-1', + sessionId: 'thread-1', + agentId: 'legal-chat-runtime', + taskSpec: task, + adapter, + id: 'persist-run', + scenarioId: 'legal-chat:thread-1', + now: () => clock, + }) + clock = 50 + handle.observe(llmCall({ tokensIn: 100, tokensOut: 50, costUsd: 0.001 })) + clock = 300 + handle.complete({ + status: 'completed', + resultSummary: 'reviewed', + metadata: { threadId: 'thread-1' }, + }) + await handle.persist({ runtimeEvents: [{ type: 'final' }] }) + + expect(rows).toHaveLength(1) + const row = rows[0]! 
+ expect(row.id).toBe('persist-run') + expect(row.workspaceId).toBe('ws-1') + expect(row.sessionId).toBe('thread-1') + expect(row.agentId).toBe('legal-chat-runtime') + expect(row.domain).toBe('legal') + expect(row.taskId).toBe('task-runtime-run') + expect(row.scenarioId).toBe('legal-chat:thread-1') + expect(row.status).toBe('completed') + expect(row.resultSummary).toBe('reviewed') + expect(row.cost.costUsd).toBeCloseTo(0.001, 9) + expect(row.cost.tokensIn).toBe(100) + expect(row.cost.tokensOut).toBe(50) + expect(row.cost.llmCalls).toBe(1) + expect(row.cost.wallMs).toBe(300) + expect(row.completedAt).toBe(new Date(300).toISOString()) + expect(row.metadata).toEqual({ threadId: 'thread-1', runtimeEvents: [{ type: 'final' }] }) + }) + + it('persist() propagates adapter errors so callers can decide whether to retry', async () => { + const adapter: RuntimeRunPersistenceAdapter = { + upsert: vi.fn(() => { + throw new Error('postgres timeout') + }), + } + const handle = startRuntimeRun({ + workspaceId: 'ws-1', + taskSpec: task, + adapter, + id: 'persist-fail', + }) + handle.complete({ status: 'failed', error: 'sandbox dropped' }) + await expect(handle.persist()).rejects.toThrow('postgres timeout') + }) + + it('persist() is a no-op when no adapter is configured', async () => { + const handle = startRuntimeRun({ workspaceId: 'ws-1', taskSpec: task, id: 'no-adapter' }) + handle.complete({ status: 'completed', resultSummary: 'ok' }) + await expect(handle.persist()).resolves.toBeUndefined() + }) + + it('toRow() returns a row matching what persist() would write', async () => { + const handle = startRuntimeRun({ + workspaceId: 'ws-1', + taskSpec: task, + id: 'dry-run', + now: () => 0, + }) + handle.observe(llmCall({ tokensIn: 10, tokensOut: 5, costUsd: 0.0001 })) + handle.complete({ status: 'completed', resultSummary: 'dry' }) + const row = handle.toRow({ extra: 'value' }) + expect(row.id).toBe('dry-run') + expect(row.status).toBe('completed') + expect(row.metadata).toEqual({ 
extra: 'value' }) + expect(row.cost.tokensIn).toBe(10) + }) + + it('complete() with an explicit cost override replaces the accumulated ledger', () => { + const handle = startRuntimeRun({ + workspaceId: 'ws-1', + taskSpec: task, + id: 'cost-override', + now: () => 0, + }) + handle.observe(llmCall({ tokensIn: 9999, tokensOut: 9999, costUsd: 9.99 })) + handle.complete({ + status: 'completed', + resultSummary: 'reconciled', + cost: { tokensIn: 100, tokensOut: 50, costUsd: 0.01, llmCalls: 1 }, + }) + expect(handle.cost()).toMatchObject({ + tokensIn: 100, + tokensOut: 50, + costUsd: 0.01, + llmCalls: 1, + }) + }) +}) diff --git a/tests/runtime.test.ts b/tests/runtime.test.ts index 72e8da6..5a3a8a5 100644 --- a/tests/runtime.test.ts +++ b/tests/runtime.test.ts @@ -1,26 +1,26 @@ import { describe, expect, it } from 'vitest' import { + type AgentAdapter, + type AgentBackendInput, + type AgentExecutionBackend, + type AgentTaskSpec, + type ControlEvalResult, createIterableBackend, createOpenAICompatibleBackend, - createSandboxPromptBackend, createRuntimeEventCollector, createRuntimeStreamEventCollector, + createSandboxPromptBackend, decideKnowledgeReadiness, encodeServerSentEvent, InMemoryRuntimeSessionStore, + type KnowledgeRequirement, + type RuntimeStreamEvent, readinessServerSentEvent, runAgentTask, runAgentTaskStream, sanitizeAgentRuntimeEvent, sanitizeRuntimeStreamEvent, summarizeAgentTaskRun, - type AgentAdapter, - type AgentExecutionBackend, - type AgentTaskSpec, - type ControlEvalResult, - type KnowledgeRequirement, - type RuntimeStreamEvent, - type AgentBackendInput, } from '../src/index' interface State { @@ -48,16 +48,19 @@ function adapter(): AgentAdapter { let current: State = { count: 0 } return { observe: () => current, - validate: ({ state }) => [{ - id: 'count-ready', - passed: state.count >= 1, - score: state.count >= 1 ? 1 : 0, - severity: 'info', - objective: true, - }], - decide: ({ state }) => state.count >= 1 - ? 
{ type: 'stop', pass: true, score: 1, reason: 'done' } - : { type: 'continue', action: { type: 'increment' }, reason: 'need one step' }, + validate: ({ state }) => [ + { + id: 'count-ready', + passed: state.count >= 1, + score: state.count >= 1 ? 1 : 0, + severity: 'info', + objective: true, + }, + ], + decide: ({ state }) => + state.count >= 1 + ? { type: 'stop', pass: true, score: 1, reason: 'done' } + : { type: 'continue', action: { type: 'increment' }, reason: 'need one step' }, act: () => { current = { count: 1 } return current @@ -94,15 +97,17 @@ describe('runAgentTask', () => { id: 'task-2', intent: 'deploy', domain: 'legal', - requiredKnowledge: [{ - ...readyReq, - id: 'customer-secret', - description: 'Customer credential', - category: 'credential_or_secret', - acquisitionMode: 'ask_user', - sensitivity: 'secret', - currentConfidence: 0, - }], + requiredKnowledge: [ + { + ...readyReq, + id: 'customer-secret', + description: 'Customer credential', + category: 'credential_or_secret', + acquisitionMode: 'ask_user', + sensitivity: 'secret', + currentConfidence: 0, + }, + ], budget: { maxSteps: 3 }, } let acted = false @@ -197,13 +202,15 @@ describe('runAgentTask', () => { expect(result.userAnswers.question_build).toContain('pnpm') expect(result.acquiredEvidenceIds).toEqual(['page:build']) expect(result.knowledge.readinessScore).toBe(1) - expect(events).toEqual(expect.arrayContaining([ - 'questions_start', - 'questions_end', - 'acquisition_start', - 'acquisition_end', - 'control_step', - ])) + expect(events).toEqual( + expect.arrayContaining([ + 'questions_start', + 'questions_end', + 'acquisition_start', + 'acquisition_end', + 'control_step', + ]), + ) expect(events.filter((event) => event === 'questions_end')).toHaveLength(1) }) @@ -213,15 +220,17 @@ describe('runAgentTask', () => { intent: 'collect secret then run', domain: 'test', inputs: { apiKey: 'sk-secret' }, - requiredKnowledge: [{ - ...readyReq, - id: 'api-key', - description: 'Customer API key', - 
category: 'credential_or_secret', - acquisitionMode: 'ask_user', - sensitivity: 'secret', - currentConfidence: 0, - }], + requiredKnowledge: [ + { + ...readyReq, + id: 'api-key', + description: 'Customer API key', + category: 'credential_or_secret', + acquisitionMode: 'ask_user', + sensitivity: 'secret', + currentConfidence: 0, + }, + ], budget: { maxSteps: 3 }, } const collector = createRuntimeEventCollector() @@ -272,15 +281,17 @@ describe('runAgentTask', () => { inputs: { customer: 'Acme' }, requiredKnowledge: [readyReq], }, - questions: [{ - id: 'q1', - question: 'Please provide: Build command', - reason: 'Required for test.', - requirementId: 'build-command', - importance: 'blocking' as const, - answerType: 'free_text' as const, - impactIfUnknown: 'The agent should not run until this is known.', - }], + questions: [ + { + id: 'q1', + question: 'Please provide: Build command', + reason: 'Required for test.', + requirementId: 'build-command', + importance: 'blocking' as const, + answerType: 'free_text' as const, + impactIfUnknown: 'The agent should not run until this is known.', + }, + ], userAnswers: { q1: 'pnpm test' }, } @@ -312,17 +323,23 @@ describe('runAgentTask', () => { const caveatTask: AgentTaskSpec = { id: 'task-9', intent: 'caveat', - requiredKnowledge: [{ - ...readyReq, - importance: 'medium', - currentConfidence: 0.2, - fallbackPolicy: 'continue_with_caveat', - }], + requiredKnowledge: [ + { + ...readyReq, + importance: 'medium', + currentConfidence: 0.2, + fallbackPolicy: 'continue_with_caveat', + }, + ], } const ready = await runAgentTask({ task: readyTask, adapter: adapter() }) const blocked = await runAgentTask({ task: blockedTask, adapter: adapter() }) - const caveat = await runAgentTask({ task: caveatTask, adapter: adapter(), minimumReadinessScore: 0 }) + const caveat = await runAgentTask({ + task: caveatTask, + adapter: adapter(), + minimumReadinessScore: 0, + }) expect(decideKnowledgeReadiness(ready.knowledge).status).toBe('ready') 
expect(decideKnowledgeReadiness(blocked.knowledge).status).toBe('blocked') @@ -330,9 +347,9 @@ describe('runAgentTask', () => { }) it('encodes safe server-sent events for runtime telemetry streams', async () => { - expect(encodeServerSentEvent({ type: 'ping' }, { event: 'runtime\nbad', id: 'id\n1', retry: 1000 })).toBe( - 'id: id 1\nevent: runtime bad\nretry: 1000\ndata: {"type":"ping"}\n\n', - ) + expect( + encodeServerSentEvent({ type: 'ping' }, { event: 'runtime\nbad', id: 'id\n1', retry: 1000 }), + ).toBe('id: id 1\nevent: runtime bad\nretry: 1000\ndata: {"type":"ping"}\n\n') expect(encodeServerSentEvent('line one\nline two')).toBe('data: line one\ndata: line two\n\n') }) @@ -345,7 +362,9 @@ describe('runAgentTask', () => { }, adapter: adapter(), }) - const event = readinessServerSentEvent(result.knowledge, { includeRequirementDescriptions: true }) + const event = readinessServerSentEvent(result.knowledge, { + includeRequirementDescriptions: true, + }) const namedEvent = readinessServerSentEvent(result.knowledge, { event: 'readiness' }) expect(event).not.toContain('event:') @@ -412,21 +431,25 @@ describe('runAgentTask', () => { } const task = { id: 'stream-ready', intent: 'continue coding', requiredKnowledge: [readyReq] } - const first = await collect(runAgentTaskStream({ - task, - backend, - input: { message: 'hello' }, - sessionStore: store, - sessionId: 'session-1', - })) - const second = await collect(runAgentTaskStream({ - task, - backend, - input: { message: ' again' }, - sessionStore: store, - sessionId: 'session-1', - resume: true, - })) + const first = await collect( + runAgentTaskStream({ + task, + backend, + input: { message: 'hello' }, + sessionStore: store, + sessionId: 'session-1', + }), + ) + const second = await collect( + runAgentTaskStream({ + task, + backend, + input: { message: ' again' }, + sessionStore: store, + sessionId: 'session-1', + resume: true, + }), + ) expect(first.find((event) => event.type === 'session_created')).toBeDefined() 
expect(second.find((event) => event.type === 'session_resumed')).toBeDefined() @@ -436,7 +459,9 @@ describe('runAgentTask', () => { const toolCall = first.find((event) => event.type === 'tool_call')! expect(JSON.stringify(sanitizeRuntimeStreamEvent(toolCall))).not.toContain('secret.ts') - expect(JSON.stringify(sanitizeRuntimeStreamEvent(toolCall, { includeControlPayloads: true }))).toContain('secret.ts') + expect( + JSON.stringify(sanitizeRuntimeStreamEvent(toolCall, { includeControlPayloads: true })), + ).toContain('secret.ts') }) it('maps sandbox prompt events into runtime stream events', async () => { @@ -449,18 +474,25 @@ describe('runAgentTask', () => { yield { type: 'tool_result', data: { name: 'Read', output: 'ok' } } }, }) - const events = await collect(runAgentTaskStream({ - task: { id: 'sandbox-task', intent: 'inspect', requiredKnowledge: [readyReq] }, - backend, - input: { message: 'go' }, - })) + const events = await collect( + runAgentTaskStream({ + task: { id: 'sandbox-task', intent: 'inspect', requiredKnowledge: [readyReq] }, + backend, + input: { message: 'go' }, + }), + ) expect(events.find((event) => event.type === 'session_created')).toMatchObject({ type: 'session_created', session: { id: 'box-1', backend: 'sandbox' }, }) - expect(events.filter((event) => event.type === 'text_delta').map((event) => event.text)).toEqual(['hi']) - expect(events.find((event) => event.type === 'tool_call')).toMatchObject({ type: 'tool_call', toolName: 'Read' }) + expect( + events.filter((event) => event.type === 'text_delta').map((event) => event.text), + ).toEqual(['hi']) + expect(events.find((event) => event.type === 'tool_call')).toMatchObject({ + type: 'tool_call', + toolName: 'Read', + }) }) it('parses OpenAI-compatible streamed chat completions', async () => { @@ -468,20 +500,28 @@ describe('runAgentTask', () => { apiKey: 'sk-test', baseUrl: 'https://router.example/v1', model: 'model-a', - fetchImpl: async () => new Response( - 'data: 
{"choices":[{"delta":{"content":"hel"}}]}\n\n' - + 'data: {"choices":[{"delta":{"content":"lo"}}]}\n\n' - + 'data: [DONE]\n\n', - { status: 200 }, - ), + fetchImpl: async () => + new Response( + 'data: {"choices":[{"delta":{"content":"hel"}}]}\n\n' + + 'data: {"choices":[{"delta":{"content":"lo"}}]}\n\n' + + 'data: [DONE]\n\n', + { status: 200 }, + ), }) - const events = await collect(runAgentTaskStream({ - task: { id: 'chat-task', intent: 'say hello', requiredKnowledge: [readyReq] }, - backend, - input: { message: 'hello' }, - })) + const events = await collect( + runAgentTaskStream({ + task: { id: 'chat-task', intent: 'say hello', requiredKnowledge: [readyReq] }, + backend, + input: { message: 'hello' }, + }), + ) - expect(events.filter((event) => event.type === 'text_delta').map((event) => event.text).join('')).toBe('hello') + expect( + events + .filter((event) => event.type === 'text_delta') + .map((event) => event.text) + .join(''), + ).toBe('hello') expect(events.at(-1)).toMatchObject({ type: 'final', status: 'completed', text: 'hello' }) }) @@ -498,16 +538,21 @@ describe('runAgentTask', () => { throw new Error('sandbox lost') }, } - const events = await collect(runAgentTaskStream({ - task: { id: 'failing-task', intent: 'run', requiredKnowledge: [readyReq] }, - backend, - sessionStore: store, - sessionId: 'failing-session', - })) + const events = await collect( + runAgentTaskStream({ + task: { id: 'failing-task', intent: 'run', requiredKnowledge: [readyReq] }, + backend, + sessionStore: store, + sessionId: 'failing-session', + }), + ) expect(stopped).toEqual(['sandbox lost']) expect(store.get('failing-session')?.status).toBe('failed') - expect(store.listEvents('failing-session').at(-1)).toMatchObject({ type: 'final', status: 'failed' }) + expect(store.listEvents('failing-session').at(-1)).toMatchObject({ + type: 'final', + status: 'failed', + }) expect(events.find((event) => event.type === 'backend_error')).toMatchObject({ type: 'backend_error', backend: 
'failing-harness', @@ -522,10 +567,40 @@ describe('runAgentTask', () => { const backend = createIterableBackend({ kind: 'fake-stream', async *stream(_input, ctx) { - yield { type: 'tool_call', task: ctx.task, session: ctx.session, toolName: 'shell', args: { cmd: 'rm -rf /etc/secret.txt' }, timestamp: '2026-05-10T00:00:00.000Z' } - yield { type: 'tool_result', task: ctx.task, session: ctx.session, toolName: 'shell', result: { stdout: 'sk-leaked' }, timestamp: '2026-05-10T00:00:00.000Z' } - yield { type: 'artifact', task: ctx.task, session: ctx.session, artifactId: 'a1', name: 'report.json', mimeType: 'application/json', uri: 's3://internal/secret-bucket/key', metadata: { customerId: 'cust-99' }, timestamp: '2026-05-10T00:00:00.000Z' } - yield { type: 'text_delta', task: ctx.task, session: ctx.session, text: 'hi from agent', timestamp: '2026-05-10T00:00:00.000Z' } + yield { + type: 'tool_call', + task: ctx.task, + session: ctx.session, + toolName: 'shell', + args: { cmd: 'rm -rf /etc/secret.txt' }, + timestamp: '2026-05-10T00:00:00.000Z', + } + yield { + type: 'tool_result', + task: ctx.task, + session: ctx.session, + toolName: 'shell', + result: { stdout: 'sk-leaked' }, + timestamp: '2026-05-10T00:00:00.000Z', + } + yield { + type: 'artifact', + task: ctx.task, + session: ctx.session, + artifactId: 'a1', + name: 'report.json', + mimeType: 'application/json', + uri: 's3://internal/secret-bucket/key', + metadata: { customerId: 'cust-99' }, + timestamp: '2026-05-10T00:00:00.000Z', + } + yield { + type: 'text_delta', + task: ctx.task, + session: ctx.session, + text: 'hi from agent', + timestamp: '2026-05-10T00:00:00.000Z', + } }, }) const task: AgentTaskSpec = { @@ -566,8 +641,24 @@ describe('runAgentTask', () => { const backend = createIterableBackend({ kind: 'fake-stream', async *stream(_input, ctx) { - yield { type: 'tool_call', task: ctx.task, session: ctx.session, toolName: 'shell', args: { cmd: 'pnpm test' }, timestamp: '2026-05-10T00:00:00.000Z' } - yield { type: 
'artifact', task: ctx.task, session: ctx.session, artifactId: 'a1', name: 'r.json', uri: 's3://bucket/key', metadata: { customerId: 'cust-1' }, timestamp: '2026-05-10T00:00:00.000Z' } + yield { + type: 'tool_call', + task: ctx.task, + session: ctx.session, + toolName: 'shell', + args: { cmd: 'pnpm test' }, + timestamp: '2026-05-10T00:00:00.000Z', + } + yield { + type: 'artifact', + task: ctx.task, + session: ctx.session, + artifactId: 'a1', + name: 'r.json', + uri: 's3://bucket/key', + metadata: { customerId: 'cust-1' }, + timestamp: '2026-05-10T00:00:00.000Z', + } }, }) const task: AgentTaskSpec = { @@ -591,7 +682,13 @@ describe('runAgentTask', () => { const backend = createIterableBackend({ kind: 'fake-stream', async *stream(_input, ctx) { - yield { type: 'text_delta', task: ctx.task, session: ctx.session, text: 'partial', timestamp: '2026-05-10T00:00:00.000Z' } + yield { + type: 'text_delta', + task: ctx.task, + session: ctx.session, + text: 'partial', + timestamp: '2026-05-10T00:00:00.000Z', + } }, }) for await (const event of runAgentTaskStream({ @@ -619,14 +716,20 @@ describe('runAgentTask', () => { stop: () => { throw new Error('cleanup refused') }, + // Regression test: a stream generator that throws before any yield + // exercises the runtime's empty-event cleanup path (backend_error / + // task_end / final must still flow). 
+ // biome-ignore lint/correctness/useYield: see comment above async *stream() { throw new Error('primary stream failure') }, } - const events = await collect(runAgentTaskStream({ - task: { id: 'cleanup-failure-task', intent: 'run', requiredKnowledge: [readyReq] }, - backend, - })) + const events = await collect( + runAgentTaskStream({ + task: { id: 'cleanup-failure-task', intent: 'run', requiredKnowledge: [readyReq] }, + backend, + }), + ) expect(events.find((event) => event.type === 'backend_error')).toMatchObject({ type: 'backend_error', diff --git a/tests/trace-bridge.test.ts b/tests/trace-bridge.test.ts new file mode 100644 index 0000000..7ccb7ae --- /dev/null +++ b/tests/trace-bridge.test.ts @@ -0,0 +1,208 @@ +import { describe, expect, it } from 'vitest' +import { + createTraceBridge, + type RuntimeStreamEvent, + toAgentEvalTrace, + ValidationError, +} from '../src/index' + +const task = { id: 'task-1', intent: 'Run a chat turn', domain: 'legal' } +const session = { + id: 'thread-1', + backend: 'tcloud', + status: 'active' as const, + createdAt: '2026-05-10T00:00:00.000Z', + updatedAt: '2026-05-10T00:00:00.000Z', +} + +describe('createTraceBridge', () => { + it('rejects construction without a runId', () => { + expect(() => createTraceBridge({ runId: '' })).toThrow(ValidationError) + }) + + it('maps lifecycle events to log-kind trace events with the runId stamped', () => { + const bridge = createTraceBridge({ runId: 'run-1', spanId: 'span-1' }) + const ts = '2026-05-10T00:00:00.000Z' + const taskStart = bridge.toTraceEvent({ type: 'task_start', task, timestamp: ts }) + expect(taskStart).toMatchObject({ + runId: 'run-1', + spanId: 'span-1', + kind: 'log', + payload: { phase: 'task_start', taskId: 'task-1' }, + }) + expect(taskStart?.timestamp).toBe(Date.parse(ts)) + }) + + it('maps readiness_end to policy_violation when the decision is blocked', () => { + const bridge = createTraceBridge({ runId: 'run-2' }) + const trace = bridge.toTraceEvent({ + type: 
'readiness_end', + task, + timestamp: '2026-05-10T00:00:00.000Z', + knowledge: { + taskId: 'task-1', + readinessScore: 0, + reason: 'missing', + severity: 'error', + recommendedAction: 'collect_missing_data', + blockingMissingRequirements: [], + nonBlockingGaps: [], + bundle: { + taskId: 'task-1', + readinessScore: 0, + missing: [], + evidence: [], + evidenceIds: [], + userAnswers: {}, + }, + }, + decision: { + passed: false, + status: 'blocked', + reason: 'missing', + readinessScore: 0, + recommendedAction: 'collect_missing_data', + severity: 'error', + blockingGapIds: ['missing-doc'], + nonBlockingGapIds: [], + }, + }) + expect(trace?.kind).toBe('policy_violation') + expect(trace?.payload).toMatchObject({ + status: 'blocked', + blockingGapIds: ['missing-doc'], + }) + }) + + it('maps backend_error and failed task_end to error kind', () => { + const bridge = createTraceBridge({ runId: 'run-3' }) + const backendError = bridge.toTraceEvent({ + type: 'backend_error', + task, + session, + backend: 'tcloud', + message: 'sandbox lost', + recoverable: true, + timestamp: '2026-05-10T00:00:00.000Z', + }) + expect(backendError?.kind).toBe('error') + expect(backendError?.payload).toMatchObject({ message: 'sandbox lost', recoverable: true }) + + const failedTaskEnd = bridge.toTraceEvent({ + type: 'task_end', + task, + status: 'failed', + reason: 'sandbox lost', + timestamp: '2026-05-10T00:00:00.000Z', + }) + expect(failedTaskEnd?.kind).toBe('error') + }) + + it('drops text_delta and reasoning_delta — they belong inside an LlmSpan', () => { + const bridge = createTraceBridge({ runId: 'run-4' }) + expect( + bridge.toTraceEvent({ + type: 'text_delta', + task, + session, + text: 'hi', + timestamp: '2026-05-10T00:00:00.000Z', + }), + ).toBeUndefined() + expect( + bridge.toTraceEvent({ + type: 'reasoning_delta', + task, + session, + text: 'thinking', + timestamp: '2026-05-10T00:00:00.000Z', + }), + ).toBeUndefined() + }) + + it('maps llm_call into a log-kind trace event carrying 
tokens + cost', () => { + const bridge = createTraceBridge({ runId: 'run-5' }) + const trace = bridge.toTraceEvent({ + type: 'llm_call', + task, + session, + model: 'claude-sonnet-4-6', + tokensIn: 100, + tokensOut: 50, + costUsd: 0.001, + latencyMs: 320, + finishReason: 'stop', + timestamp: '2026-05-10T00:00:00.000Z', + }) + expect(trace?.kind).toBe('log') + expect(trace?.payload).toMatchObject({ + phase: 'llm_call', + model: 'claude-sonnet-4-6', + tokensIn: 100, + tokensOut: 50, + costUsd: 0.001, + }) + }) + + it('drain() projects a full stream into trace events in order', () => { + const bridge = createTraceBridge({ runId: 'run-6' }) + const events: RuntimeStreamEvent[] = [ + { type: 'task_start', task, timestamp: '2026-05-10T00:00:00.000Z' }, + { type: 'readiness_start', task, timestamp: '2026-05-10T00:00:01.000Z' }, + { + type: 'text_delta', + task, + session, + text: 'dropped', + timestamp: '2026-05-10T00:00:02.000Z', + }, + { + type: 'task_end', + task, + status: 'completed', + reason: 'ok', + timestamp: '2026-05-10T00:00:03.000Z', + }, + ] + const traces = bridge.drain(events) + expect(traces.map((trace) => trace.payload.phase)).toEqual([ + 'task_start', + 'readiness_start', + 'task_end', + ]) + expect(traces.map((trace) => trace.eventId)).toEqual(['evt-1', 'evt-2', 'evt-3']) + }) + + it('toAgentEvalTrace() one-shot matches createTraceBridge.toTraceEvent()', () => { + const event: RuntimeStreamEvent = { + type: 'task_start', + task, + timestamp: '2026-05-10T00:00:00.000Z', + } + const oneShot = toAgentEvalTrace(event, { runId: 'run-7' }) + expect(oneShot).toMatchObject({ runId: 'run-7', kind: 'log' }) + }) + + it('falls back to Date.now() when an event lacks a timestamp', () => { + const bridge = createTraceBridge({ runId: 'run-8' }) + const before = Date.now() + const trace = bridge.toTraceEvent({ + type: 'text_delta', + task, + session, + text: 'untimed', + }) + expect(trace).toBeUndefined() + + // tool_call is the simplest event whose timestamp may 
legitimately be + // omitted by a backend; the bridge must still produce a valid trace. + const toolCall = bridge.toTraceEvent({ + type: 'tool_call', + task, + session, + toolName: 'shell', + }) + expect(toolCall).toBeDefined() + expect(toolCall!.timestamp).toBeGreaterThanOrEqual(before) + }) +}) diff --git a/tsconfig.json b/tsconfig.json index 51a8087..a8b383f 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -16,7 +16,8 @@ "isolatedModules": true, "noUnusedLocals": true, "noUnusedParameters": true, - "noFallthroughCasesInSwitch": true + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true }, "include": ["src"], "exclude": ["node_modules", "dist", "tests"]