diff --git a/EVENTS.md b/EVENTS.md index 45a6ff3..696add3 100644 --- a/EVENTS.md +++ b/EVENTS.md @@ -82,13 +82,36 @@ Emitted when an assistant message finishes (one per LLM turn). "providerID": "anthropic", "agent": "default", "cost": { "input": 0.003, "output": 0.012, "cache": { "read": 0, "write": 0 } }, - "tokens": { "input": 1024, "output": 512, "cache": { "read": 0, "write": 0 } }, + "tokens": { + "input": 1024, + "output": 512, + "reasoning": 0, + "cache": { "read": 8800, "write": 1024 } + }, + "context": { "used": 10848, "limit": 200000, "ratio": 0.05424 }, "finish": "tool-calls" } ``` `finish` values: `"tool-calls"` (model wants to call tools), `"end_turn"` (model is done), `"max_tokens"` (output truncated). +**`tokens`** (5-way breakdown, mirrors upstream `LLM.Usage`): + +- `input` (number) — raw input tokens billed at the standard input rate. +- `output` (number) — output (completion) tokens. +- `reasoning` (number) — extended-thinking / reasoning tokens (0 when thinking is off). +- `cache.read` (number) — tokens served from the prompt cache (billed at cache-read rate). Distinguishes cache hits from fresh input. +- `cache.write` (number) — tokens written to the prompt cache (billed at cache-write rate). + +These fields are non-overlapping: a token is counted in exactly one bucket. + +**`context`** (context-window utilization): + +- `used` (number) — tokens occupying the model's context window this turn: `input + cache.read + cache.write`. +- `limit` (number) — the model's total context-window size in tokens, sourced from the models.dev registry (`model.limit.context`). +- `ratio` (number) — `used / limit` (≥0; may exceed 1 if usage exceeds the model's registered limit). A value approaching or exceeding 1 signals context-exhaustion risk. +- `null` — emitted when the model's context limit is not known (e.g. unregistered custom endpoint). + ### `text` Emitted when a text block from the assistant is complete. diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts index a4cfb2b..754149a 100644 --- a/packages/cli/src/cli/cmd/run.ts +++ b/packages/cli/src/cli/cmd/run.ts @@ -80,6 +80,31 @@ function fallback(part: ToolPart) { }) } +/** + * Build the context-window utilization object for `message_complete` events. + * + * Returns `null` (meaning "unknown") when: + * - `contextLimit` is `null` — Provider.getModel threw (unregistered model) + * - `contextLimit` is `0` — custom model without a registered limit defaults to + * `limit.context = 0` (the provider's default for unregistered custom models). + * A zero limit would yield `Infinity`/`NaN` for ratio, which `JSON.stringify` + * serialises as `null` inside the object — diverging from the documented + * top-level `null` contract (EVENTS.md). + * + * @internal exported for unit-testing only + */ +export function buildContextWindow( + contextLimit: number | null, + contextUsed: number, +): { used: number; limit: number; ratio: number } | null { + if (contextLimit == null || contextLimit <= 0) return null + return { + used: contextUsed, + limit: contextLimit, + ratio: contextUsed / contextLimit, + } +} + function glob(info: ToolProps) { const root = info.input.path ?? "" const title = `Glob "${info.input.pattern}"` @@ -463,12 +488,46 @@ export const RunCommand = cmd({ const info = event.properties.info if (args.format === "json") { if (info.finish) { + // Build 5-way token breakdown mirroring upstream LLM.Usage shape. + // info.tokens already carries the full breakdown from StepFinishPart + // accumulation — reasoning and cache split are not dropped upstream. + const tokens = { + input: info.tokens.input, + output: info.tokens.output, + reasoning: info.tokens.reasoning, + cache: { + read: info.tokens.cache.read, + write: info.tokens.cache.write, + }, + } + + // Context-window utilization: used = input + cache.read + cache.write + // (all prompt tokens that occupy the model's context window this turn). + // cache.write tokens are written to the cache ON this turn — they are + // part of the prompt sent to the model and count against the context + // window, just billed at the cache-write rate. Excluding them + // undercounts utilization on the first turn of a conversation. + // limit comes from the model registry (models.dev). On lookup failure + // (or limit===0 for custom models) buildContextWindow returns null. + const contextLimit = await Provider.getModel(info.providerID, info.modelID) + .then((m) => m.limit.context) + .catch((e) => { + if (e instanceof Provider.ModelNotFoundError) return null + throw e + }) + const contextUsed = tokens.input + tokens.cache.read + tokens.cache.write + const context = buildContextWindow(contextLimit, contextUsed) + emit("message_complete", { modelID: info.modelID, providerID: info.providerID, agent: info.agent, + // cost is sourced from info.cost which accumulates real per-step costs + // from StepFinishPart. Do NOT use the new step.ended event cost field + // which emits cost:0 and is reconciled later (the cost:0 trap). cost: info.cost, - tokens: info.tokens, + tokens, + context, finish: info.finish, }) } diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts new file mode 100644 index 0000000..cf68baf --- /dev/null +++ b/packages/cli/test/cli/usage-token-breakdown.test.ts @@ -0,0 +1,191 @@ +import path from "path" +import { describe, expect, test } from "bun:test" +import { buildContextWindow } from "../../src/cli/cmd/run" + +const EVENTS_MD = path.resolve(import.meta.dir, "../../../../EVENTS.md") + +/** + * Tests for issue #86 — 5-way token breakdown + context-window utilization + * in message_complete events. + */ +describe("buildContextWindow (#86)", () => { + // 🟠 regression: limit===0 (custom model default) must return null, not {ratio:Infinity} + test("returns null when contextLimit is 0 (custom model without registered limit)", () => { + const result = buildContextWindow(0, 9824) + expect(result).toBeNull() + }) + + test("returns null when contextLimit is null (Provider.getModel threw)", () => { + const result = buildContextWindow(null, 9824) + expect(result).toBeNull() + }) + + test("returns null when both limit and used are 0", () => { + const result = buildContextWindow(0, 0) + expect(result).toBeNull() + }) + + test("computes used as the value passed in (caller sets input + cache.read + cache.write)", () => { + // Regression: cache.write must be included in contextUsed at the call site (run.ts). + // buildContextWindow receives the pre-summed value; this test verifies the helper + // honours it (and that the sum is documented correctly: input + cache.read + cache.write). + const input = 1024 + const cacheRead = 8800 + const cacheWrite = 1024 + const contextUsed = input + cacheRead + cacheWrite // = 10848 + const result = buildContextWindow(200_000, contextUsed) + expect(result).not.toBeNull() + expect(result!.used).toBe(10848) + }) + + test("sets limit to the contextLimit value", () => { + const result = buildContextWindow(200_000, 9824) + expect(result).not.toBeNull() + expect(result!.limit).toBe(200_000) + }) + + test("ratio is used / limit", () => { + const result = buildContextWindow(200_000, 9824) + expect(result).not.toBeNull() + expect(result!.ratio).toBeCloseTo(9824 / 200_000, 10) + }) + + test("ratio is between 0 and 1 for realistic values", () => { + const result = buildContextWindow(128_000, 64_000) + expect(result).not.toBeNull() + expect(result!.ratio).toBe(0.5) + }) + + test("ratio is exactly 1 when context is fully used", () => { + const result = buildContextWindow(100_000, 100_000) + expect(result).not.toBeNull() + expect(result!.ratio).toBe(1) + }) + + test("ratio is 0 when no tokens used (empty prompt start)", () => { + const result = buildContextWindow(200_000, 0) + expect(result).not.toBeNull() + expect(result!.ratio).toBe(0) + }) + + test("result is JSON-serialisable without Infinity or NaN", () => { + const result = buildContextWindow(200_000, 9824) + const serialised = JSON.stringify(result) + expect(serialised).not.toContain("null") + const parsed = JSON.parse(serialised) + expect(parsed.ratio).toBeCloseTo(9824 / 200_000, 10) + }) + + test("top-level null serialises cleanly (not as object with null ratio)", () => { + // The documented contract: limit unknown → top-level null, not {ratio:null} + const result = buildContextWindow(0, 9824) + expect(JSON.stringify(result)).toBe("null") + }) + + test("ratio may exceed 1 when usage exceeds the registered limit (unclamped)", () => { + // EVENTS.md documents ratio as ≥0 (may exceed 1), not clamped to [0,1]. + // Stale/lowered model.limit.context can produce ratio > 1 in production. + const result = buildContextWindow(100_000, 110_000) + expect(result).not.toBeNull() + expect(result!.ratio).toBeGreaterThan(1) + expect(result!.ratio).toBeCloseTo(1.1, 10) + }) +}) + +describe("message_complete emit block shape (source-verified, #86)", () => { + // These source-text checks verify structural wiring in the emit call site + // that cannot be covered by pure unit-testing buildContextWindow. + const RUN_SRC = path.resolve(import.meta.dir, "../../src/cli/cmd/run.ts") + + test("emit block passes tokens with reasoning and cache read/write fields", async () => { + const source = await Bun.file(RUN_SRC).text() + const emitIdx = source.indexOf('emit("message_complete"') + expect(emitIdx).toBeGreaterThan(-1) + const blockStart = Math.max(0, emitIdx - 1500) + const block = source.slice(blockStart, emitIdx + 200) + expect(block).toContain("reasoning") + expect(block).toContain("cache") + expect(block).toContain("read") + expect(block).toContain("write") + }) + + test("emit block calls buildContextWindow (not inline ternary)", async () => { + const source = await Bun.file(RUN_SRC).text() + const emitIdx = source.indexOf('emit("message_complete"') + expect(emitIdx).toBeGreaterThan(-1) + const blockStart = Math.max(0, emitIdx - 1500) + const block = source.slice(blockStart, emitIdx + 200) + expect(block).toContain("buildContextWindow") + }) + + test("emit block includes context field", async () => { + const source = await Bun.file(RUN_SRC).text() + const emitIdx = source.indexOf('emit("message_complete"') + expect(emitIdx).toBeGreaterThan(-1) + // emit object spans ~400 chars; search up to closing paren + const block = source.slice(emitIdx, emitIdx + 500) + expect(block).toContain("context") + }) + + test("cost field is still emitted (not regressed)", async () => { + const source = await Bun.file(RUN_SRC).text() + const idx = source.indexOf('emit("message_complete"') + expect(idx).toBeGreaterThan(-1) + const block = source.slice(idx, idx + 800) + expect(block).toContain("cost:") + }) + + test("getModel catch rethrows non-ModelNotFoundError (targeted catch, not swallow-all)", async () => { + // Verify the catch block only silences ModelNotFoundError; unexpected errors must propagate. + // Source check: catch body must reference ModelNotFoundError (not be an empty arrow). + const source = await Bun.file(RUN_SRC).text() + const getModelIdx = source.indexOf("Provider.getModel(info.providerID") + expect(getModelIdx).toBeGreaterThan(-1) + const catchWindow = source.slice(getModelIdx, getModelIdx + 400) + expect(catchWindow).toContain("ModelNotFoundError") + expect(catchWindow).toContain("throw e") + }) + + test("contextUsed includes cache.write (regression: must not omit cache.write from context sum)", async () => { + // Regression for the bug where `contextUsed = tokens.input + tokens.cache.read` + // omitted cache.write, undercounting first-turn utilization. The fix is: + // const contextUsed = tokens.input + tokens.cache.read + tokens.cache.write + // This source-text check verifies the three-way sum is present at the call site. + const source = await Bun.file(RUN_SRC).text() + const contextUsedIdx = source.indexOf("const contextUsed =") + expect(contextUsedIdx).toBeGreaterThan(-1) + const line = source.slice(contextUsedIdx, contextUsedIdx + 100) + expect(line).toContain("cache.write") + }) +}) + +describe("EVENTS.md documents token breakdown and context (#86)", () => { + test("EVENTS.md message_complete section includes reasoning token field", async () => { + const doc = await Bun.file(EVENTS_MD).text() + const idx = doc.indexOf("message_complete") + expect(idx).toBeGreaterThan(-1) + const section = doc.slice(idx, idx + 1500) + expect(section).toContain("reasoning") + }) + + test("EVENTS.md message_complete section documents context field", async () => { + const doc = await Bun.file(EVENTS_MD).text() + const idx = doc.indexOf("message_complete") + expect(idx).toBeGreaterThan(-1) + const section = doc.slice(idx, idx + 1500) + expect(section).toContain("context") + expect(section).toContain("used") + expect(section).toContain("limit") + expect(section).toContain("ratio") + }) + + test("EVENTS.md documents null as the unknown-limit sentinel", async () => { + const doc = await Bun.file(EVENTS_MD).text() + const idx = doc.indexOf("message_complete") + expect(idx).toBeGreaterThan(-1) + // null sentinel doc is ~1593 chars after message_complete heading; use 2000 window + const section = doc.slice(idx, idx + 2000) + // The documented contract: null = context limit not known + expect(section).toContain("null") + }) +})