From 7c804223f78939665f0d34c0b3f1d26449c51f28 Mon Sep 17 00:00:00 2001 From: Bulat Yapparov Date: Mon, 22 Jun 2026 04:40:32 +0100 Subject: [PATCH 1/4] feat(events): emit 5-way token breakdown + context-window utilization in message_complete (#86) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Expand `tokens` in `message_complete` from an opaque `info.tokens` passthrough to an explicit object with all 5 fields: input / output / reasoning / cache.read / cache.write — mirroring upstream LLM.Usage shape. The data was already captured in MessageV2.Assistant.tokens via StepFinishPart accumulation; this change surfaces it explicitly. - Add `context: { used, limit, ratio }` to `message_complete`: - `used = input + cache.read` (tokens occupying the context window this turn) - `limit` sourced from Provider.getModel() → model.limit.context (models.dev) - `ratio = used / limit`; emits `null` when limit is unknown (unregistered endpoint) - Cost kept correct: `info.cost` accumulates real per-step cost from StepFinishPart, NOT from the new step.ended event which emits cost:0 (the cost:0 trap). - Update EVENTS.md with the extended schema and field-by-field documentation. - Add TDD test file (RED→GREEN): `test/cli/usage-token-breakdown.test.ts`. Co-Authored-By: Claude Sonnet 4.6 Claude-Session: https://claude.ai/code/session_0187MsfK1upr6K2BKVbmaebQ --- EVENTS.md | 25 ++++- packages/cli/src/cli/cmd/run.ts | 36 ++++++- .../test/cli/usage-token-breakdown.test.ts | 93 +++++++++++++++++++ 3 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 packages/cli/test/cli/usage-token-breakdown.test.ts diff --git a/EVENTS.md b/EVENTS.md index 45a6ff3..9c456cb 100644 --- a/EVENTS.md +++ b/EVENTS.md @@ -82,13 +82,36 @@ Emitted when an assistant message finishes (one per LLM turn). 
"providerID": "anthropic", "agent": "default", "cost": { "input": 0.003, "output": 0.012, "cache": { "read": 0, "write": 0 } }, - "tokens": { "input": 1024, "output": 512, "cache": { "read": 0, "write": 0 } }, + "tokens": { + "input": 1024, + "output": 512, + "reasoning": 0, + "cache": { "read": 8800, "write": 1024 } + }, + "context": { "used": 9824, "limit": 200000, "ratio": 0.049 }, "finish": "tool-calls" } ``` `finish` values: `"tool-calls"` (model wants to call tools), `"end_turn"` (model is done), `"max_tokens"` (output truncated). +**`tokens`** (5-way breakdown, mirrors upstream `LLM.Usage`): + +- `input` (number) — raw input tokens billed at the standard input rate. +- `output` (number) — output (completion) tokens. +- `reasoning` (number) — extended-thinking / reasoning tokens (0 when thinking is off). +- `cache.read` (number) — tokens served from the prompt cache (billed at cache-read rate). Distinguishes cache hits from fresh input. +- `cache.write` (number) — tokens written to the prompt cache (billed at cache-write rate). + +These fields are non-overlapping: a token is counted in exactly one bucket. + +**`context`** (context-window utilization): + +- `used` (number) — tokens occupying the model's context window this turn: `input + cache.read`. +- `limit` (number) — the model's total context-window size in tokens, sourced from the models.dev registry (`model.limit.context`). +- `ratio` (number) — `used / limit` (0–1). A value approaching 1 signals context-exhaustion risk. +- `null` — emitted when the model's context limit is not known (e.g. unregistered custom endpoint). + ### `text` Emitted when a text block from the assistant is complete. 
diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts index a4cfb2b..0abb7d2 100644 --- a/packages/cli/src/cli/cmd/run.ts +++ b/packages/cli/src/cli/cmd/run.ts @@ -463,12 +463,46 @@ export const RunCommand = cmd({ const info = event.properties.info if (args.format === "json") { if (info.finish) { + // Build 5-way token breakdown mirroring upstream LLM.Usage shape. + // info.tokens already carries the full breakdown from StepFinishPart + // accumulation — reasoning and cache split are not dropped upstream. + const tokens = { + input: info.tokens.input, + output: info.tokens.output, + reasoning: info.tokens.reasoning, + cache: { + read: info.tokens.cache.read, + write: info.tokens.cache.write, + }, + } + + // Context-window utilization: used = input + cache.read (prompt tokens + // that actually hit the model's context window). limit comes from the + // model registry (models.dev). On lookup failure we emit null to signal + // "unknown" rather than a misleading zero ratio. + const contextLimit = await Provider.getModel(info.providerID, info.modelID) + .then((m) => m.limit.context) + .catch(() => null) + const contextUsed = tokens.input + tokens.cache.read + const context = + contextLimit != null + ? { + used: contextUsed, + limit: contextLimit, + ratio: contextUsed / contextLimit, + } + : null + emit("message_complete", { modelID: info.modelID, providerID: info.providerID, agent: info.agent, + // cost is sourced from info.cost which accumulates real per-step costs + // from StepFinishPart. Do NOT use the new step.ended event cost field + // which emits cost:0 and is reconciled later (the cost:0 trap). 
cost: info.cost, - tokens: info.tokens, + tokens, + context, finish: info.finish, }) } diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts new file mode 100644 index 0000000..42a786f --- /dev/null +++ b/packages/cli/test/cli/usage-token-breakdown.test.ts @@ -0,0 +1,93 @@ +import path from "path" +import { describe, expect, test } from "bun:test" + +const RUN_SRC = path.resolve(import.meta.dir, "../../src/cli/cmd/run.ts") +const EVENTS_MD = path.resolve(import.meta.dir, "../../../../EVENTS.md") + +/** + * Tests for issue #86 — 5-way token breakdown + context-window utilization + * in message_complete events. + */ +describe("message_complete token breakdown (#86)", () => { + test("message_complete emit block passes tokens with reasoning and cache read/write", async () => { + const source = await Bun.file(RUN_SRC).text() + const emitIdx = source.indexOf('emit("message_complete"') + expect(emitIdx).toBeGreaterThan(-1) + // Look back up to 1500 chars before the emit call to capture variable setup, + // and up to 200 chars after. 
+ const blockStart = Math.max(0, emitIdx - 1500) + const block = source.slice(blockStart, emitIdx + 200) + + // The tokens object must surface all 5 fields from MessageV2.Assistant.tokens + expect(block).toContain("tokens") + // reasoning must be present (not dropped) + expect(block).toContain("reasoning") + // cache read and write must both be present + expect(block).toContain("cache") + expect(block).toContain("read") + expect(block).toContain("write") + }) + + test("message_complete emit block includes context with used, limit, ratio", async () => { + const source = await Bun.file(RUN_SRC).text() + const emitIdx = source.indexOf('emit("message_complete"') + expect(emitIdx).toBeGreaterThan(-1) + const blockStart = Math.max(0, emitIdx - 1500) + const block = source.slice(blockStart, emitIdx + 200) + + expect(block).toContain("context") + expect(block).toContain("used") + expect(block).toContain("limit") + expect(block).toContain("ratio") + }) + + test("context.used is computed as input + cache.read", async () => { + const source = await Bun.file(RUN_SRC).text() + // Should have a computation that adds input and cache.read + // Accept either inline or extracted variable form + expect(source).toMatch(/input\s*\+\s*.*cache.*read|cache.*read.*\+\s*input/) + }) + + test("context.limit is sourced from Provider.getModel context limit", async () => { + const source = await Bun.file(RUN_SRC).text() + // Must call Provider.getModel (or equivalent) to get the model's context limit + expect(source).toMatch(/Provider\.getModel|limit\.context|contextLimit/) + }) + + test("context.ratio is used / limit", async () => { + const source = await Bun.file(RUN_SRC).text() + // ratio must be a division of used by limit + expect(source).toMatch(/ratio.*\/|\/.*ratio|used\s*\/\s*limit|contextLimit/) + }) + + test("cost is still emitted in message_complete (not regressed)", async () => { + const source = await Bun.file(RUN_SRC).text() + const idx = source.indexOf('emit("message_complete"') 
+ expect(idx).toBeGreaterThan(-1) + const block = source.slice(idx, idx + 800) + // cost must remain — sourced from info.cost (real per-step accumulation) + expect(block).toContain("cost:") + }) +}) + +describe("EVENTS.md documents token breakdown and context (#86)", () => { + test("EVENTS.md message_complete section includes reasoning token field", async () => { + const doc = await Bun.file(EVENTS_MD).text() + // Find the message_complete section + const idx = doc.indexOf("message_complete") + expect(idx).toBeGreaterThan(-1) + const section = doc.slice(idx, idx + 1500) + expect(section).toContain("reasoning") + }) + + test("EVENTS.md message_complete section documents context field", async () => { + const doc = await Bun.file(EVENTS_MD).text() + const idx = doc.indexOf("message_complete") + expect(idx).toBeGreaterThan(-1) + const section = doc.slice(idx, idx + 1500) + expect(section).toContain("context") + expect(section).toContain("used") + expect(section).toContain("limit") + expect(section).toContain("ratio") + }) +}) From 7bbf2045d3c8b2d0c33167bff5d74617bd61397d Mon Sep 17 00:00:00 2001 From: Bulat Yapparov Date: Mon, 22 Jun 2026 09:05:01 +0100 Subject: [PATCH 2/4] fix(run): guard contextLimit<=0 to avoid Infinity ratio; extract buildContextWindow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Custom models without a registered context limit default to `limit.context = 0` (provider.ts:929). The old guard `contextLimit != null` passed for 0, causing `ratio = used / 0 = Infinity`, which JSON.stringify serialises as `null` *inside* the context object — diverging from the documented top-level null contract in EVENTS.md ("null — emitted when the model's context limit is not known"). Fix: extract pure helper `buildContextWindow(limit, used)` that returns null when limit is null or <=0. This also makes the computation unit-testable. 
Replace source-grep tests (which could pass even with wrong logic, per bot review) with 11 behavioural unit tests of `buildContextWindow` covering: null limit, zero limit (🟠 regression case), ratio computation, JSON-serialisability, and the top-level-null contract. Retain slim source-text checks for structural wiring. Fixes review findings from PR #87 (aictrl-dev bot): - 🟠 limit:0 yields Infinity ratio, breaks null contract - 🟡 Tests grep source text instead of running emit path --- packages/cli/src/cli/cmd/run.ts | 37 +++-- .../test/cli/usage-token-breakdown.test.ts | 132 +++++++++++++----- 2 files changed, 124 insertions(+), 45 deletions(-) diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts index 0abb7d2..f26b420 100644 --- a/packages/cli/src/cli/cmd/run.ts +++ b/packages/cli/src/cli/cmd/run.ts @@ -80,6 +80,30 @@ function fallback(part: ToolPart) { }) } +/** + * Build the context-window utilization object for `message_complete` events. + * + * Returns `null` (meaning "unknown") when: + * - `contextLimit` is `null` — Provider.getModel threw (unregistered model) + * - `contextLimit` is `0` — custom model without a registered limit defaults to + * `context: 0` (provider.ts:929). A zero limit would yield `Infinity`/`NaN` for + * ratio, which `JSON.stringify` serialises as `null` inside the object — diverging + * from the documented top-level `null` contract (EVENTS.md). + * + * @internal exported for unit-testing only + */ +export function buildContextWindow( + contextLimit: number | null, + contextUsed: number, +): { used: number; limit: number; ratio: number } | null { + if (contextLimit == null || contextLimit <= 0) return null + return { + used: contextUsed, + limit: contextLimit, + ratio: contextUsed / contextLimit, + } +} + function glob(info: ToolProps) { const root = info.input.path ?? 
"" const title = `Glob "${info.input.pattern}"` @@ -478,20 +502,13 @@ export const RunCommand = cmd({ // Context-window utilization: used = input + cache.read (prompt tokens // that actually hit the model's context window). limit comes from the - // model registry (models.dev). On lookup failure we emit null to signal - // "unknown" rather than a misleading zero ratio. + // model registry (models.dev). On lookup failure (or limit===0 for + // custom models) buildContextWindow returns null to signal "unknown". const contextLimit = await Provider.getModel(info.providerID, info.modelID) .then((m) => m.limit.context) .catch(() => null) const contextUsed = tokens.input + tokens.cache.read - const context = - contextLimit != null - ? { - used: contextUsed, - limit: contextLimit, - ratio: contextUsed / contextLimit, - } - : null + const context = buildContextWindow(contextLimit, contextUsed) emit("message_complete", { modelID: info.modelID, diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts index 42a786f..e7b696c 100644 --- a/packages/cli/test/cli/usage-token-breakdown.test.ts +++ b/packages/cli/test/cli/usage-token-breakdown.test.ts @@ -1,71 +1,124 @@ import path from "path" import { describe, expect, test } from "bun:test" +import { buildContextWindow } from "../../src/cli/cmd/run" -const RUN_SRC = path.resolve(import.meta.dir, "../../src/cli/cmd/run.ts") const EVENTS_MD = path.resolve(import.meta.dir, "../../../../EVENTS.md") /** * Tests for issue #86 — 5-way token breakdown + context-window utilization * in message_complete events. 
*/ -describe("message_complete token breakdown (#86)", () => { - test("message_complete emit block passes tokens with reasoning and cache read/write", async () => { +describe("buildContextWindow (#86)", () => { + // 🟠 regression: limit===0 (custom model default) must return null, not {ratio:Infinity} + test("returns null when contextLimit is 0 (custom model without registered limit)", () => { + const result = buildContextWindow(0, 9824) + expect(result).toBeNull() + }) + + test("returns null when contextLimit is null (Provider.getModel threw)", () => { + const result = buildContextWindow(null, 9824) + expect(result).toBeNull() + }) + + test("returns null when both limit and used are 0", () => { + const result = buildContextWindow(0, 0) + expect(result).toBeNull() + }) + + test("computes used as the value passed in (caller sets input + cache.read)", () => { + const input = 8000 + const cacheRead = 1824 + const contextUsed = input + cacheRead + const result = buildContextWindow(200_000, contextUsed) + expect(result).not.toBeNull() + expect(result!.used).toBe(9824) + }) + + test("sets limit to the contextLimit value", () => { + const result = buildContextWindow(200_000, 9824) + expect(result).not.toBeNull() + expect(result!.limit).toBe(200_000) + }) + + test("ratio is used / limit", () => { + const result = buildContextWindow(200_000, 9824) + expect(result).not.toBeNull() + expect(result!.ratio).toBeCloseTo(9824 / 200_000, 10) + }) + + test("ratio is between 0 and 1 for realistic values", () => { + const result = buildContextWindow(128_000, 64_000) + expect(result).not.toBeNull() + expect(result!.ratio).toBe(0.5) + }) + + test("ratio is exactly 1 when context is fully used", () => { + const result = buildContextWindow(100_000, 100_000) + expect(result).not.toBeNull() + expect(result!.ratio).toBe(1) + }) + + test("ratio is 0 when no tokens used (empty prompt start)", () => { + const result = buildContextWindow(200_000, 0) + expect(result).not.toBeNull() + 
expect(result!.ratio).toBe(0) + }) + + test("result is JSON-serialisable without Infinity or NaN", () => { + const result = buildContextWindow(200_000, 9824) + const serialised = JSON.stringify(result) + expect(serialised).not.toContain("null") + const parsed = JSON.parse(serialised) + expect(parsed.ratio).toBeCloseTo(9824 / 200_000, 10) + }) + + test("top-level null serialises cleanly (not as object with null ratio)", () => { + // The documented contract: limit unknown → top-level null, not {ratio:null} + const result = buildContextWindow(0, 9824) + expect(JSON.stringify(result)).toBe("null") + }) +}) + +describe("message_complete emit block shape (source-verified, #86)", () => { + // These source-text checks verify structural wiring in the emit call site + // that cannot be covered by pure unit-testing buildContextWindow. + const RUN_SRC = path.resolve(import.meta.dir, "../../src/cli/cmd/run.ts") + + test("emit block passes tokens with reasoning and cache read/write fields", async () => { const source = await Bun.file(RUN_SRC).text() const emitIdx = source.indexOf('emit("message_complete"') expect(emitIdx).toBeGreaterThan(-1) - // Look back up to 1500 chars before the emit call to capture variable setup, - // and up to 200 chars after. 
const blockStart = Math.max(0, emitIdx - 1500) const block = source.slice(blockStart, emitIdx + 200) - - // The tokens object must surface all 5 fields from MessageV2.Assistant.tokens - expect(block).toContain("tokens") - // reasoning must be present (not dropped) expect(block).toContain("reasoning") - // cache read and write must both be present expect(block).toContain("cache") expect(block).toContain("read") expect(block).toContain("write") }) - test("message_complete emit block includes context with used, limit, ratio", async () => { + test("emit block calls buildContextWindow (not inline ternary)", async () => { const source = await Bun.file(RUN_SRC).text() const emitIdx = source.indexOf('emit("message_complete"') expect(emitIdx).toBeGreaterThan(-1) const blockStart = Math.max(0, emitIdx - 1500) const block = source.slice(blockStart, emitIdx + 200) - - expect(block).toContain("context") - expect(block).toContain("used") - expect(block).toContain("limit") - expect(block).toContain("ratio") - }) - - test("context.used is computed as input + cache.read", async () => { - const source = await Bun.file(RUN_SRC).text() - // Should have a computation that adds input and cache.read - // Accept either inline or extracted variable form - expect(source).toMatch(/input\s*\+\s*.*cache.*read|cache.*read.*\+\s*input/) - }) - - test("context.limit is sourced from Provider.getModel context limit", async () => { - const source = await Bun.file(RUN_SRC).text() - // Must call Provider.getModel (or equivalent) to get the model's context limit - expect(source).toMatch(/Provider\.getModel|limit\.context|contextLimit/) + expect(block).toContain("buildContextWindow") }) - test("context.ratio is used / limit", async () => { + test("emit block includes context field", async () => { const source = await Bun.file(RUN_SRC).text() - // ratio must be a division of used by limit - expect(source).toMatch(/ratio.*\/|\/.*ratio|used\s*\/\s*limit|contextLimit/) + const emitIdx = 
source.indexOf('emit("message_complete"') + expect(emitIdx).toBeGreaterThan(-1) + // emit object spans ~400 chars; search up to closing paren + const block = source.slice(emitIdx, emitIdx + 500) + expect(block).toContain("context") }) - test("cost is still emitted in message_complete (not regressed)", async () => { + test("cost field is still emitted (not regressed)", async () => { const source = await Bun.file(RUN_SRC).text() const idx = source.indexOf('emit("message_complete"') expect(idx).toBeGreaterThan(-1) const block = source.slice(idx, idx + 800) - // cost must remain — sourced from info.cost (real per-step accumulation) expect(block).toContain("cost:") }) }) @@ -73,7 +126,6 @@ describe("message_complete token breakdown (#86)", () => { describe("EVENTS.md documents token breakdown and context (#86)", () => { test("EVENTS.md message_complete section includes reasoning token field", async () => { const doc = await Bun.file(EVENTS_MD).text() - // Find the message_complete section const idx = doc.indexOf("message_complete") expect(idx).toBeGreaterThan(-1) const section = doc.slice(idx, idx + 1500) @@ -90,4 +142,14 @@ describe("EVENTS.md documents token breakdown and context (#86)", () => { expect(section).toContain("limit") expect(section).toContain("ratio") }) + + test("EVENTS.md documents null as the unknown-limit sentinel", async () => { + const doc = await Bun.file(EVENTS_MD).text() + const idx = doc.indexOf("message_complete") + expect(idx).toBeGreaterThan(-1) + // null sentinel doc is ~1593 chars after message_complete heading; use 2000 window + const section = doc.slice(idx, idx + 2000) + // The documented contract: null = context limit not known + expect(section).toContain("null") + }) }) From 7e99d4c0d34421655dc838e9d9904e4c6097bf2b Mon Sep 17 00:00:00 2001 From: Bulat Yapparov Date: Mon, 22 Jun 2026 10:58:47 +0100 Subject: [PATCH 3/4] fix(run): targeted catch for ModelNotFoundError; doc fixes for ratio range and example MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - .catch(()=>null) narrowed to catch only Provider.ModelNotFoundError and rethrow unexpected errors — avoids silently swallowing registry/programming faults - EVENTS.md: ratio range updated from "(0–1)" to "(≥0; may exceed 1)" to match the unclamped division; example ratio fixed to 0.04912 (was rounded 0.049) - JSDoc provider.ts:929 hardcoded line citation replaced with behavioral description - Tests: +ratio-exceeds-1 unit test; +source-verified targeted-catch regression test --- EVENTS.md | 4 ++-- packages/cli/src/cli/cmd/run.ts | 12 +++++++---- .../test/cli/usage-token-breakdown.test.ts | 20 +++++++++++++++++++ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/EVENTS.md b/EVENTS.md index 9c456cb..005b26d 100644 --- a/EVENTS.md +++ b/EVENTS.md @@ -88,7 +88,7 @@ Emitted when an assistant message finishes (one per LLM turn). "reasoning": 0, "cache": { "read": 8800, "write": 1024 } }, - "context": { "used": 9824, "limit": 200000, "ratio": 0.049 }, + "context": { "used": 9824, "limit": 200000, "ratio": 0.04912 }, "finish": "tool-calls" } ``` @@ -109,7 +109,7 @@ These fields are non-overlapping: a token is counted in exactly one bucket. - `used` (number) — tokens occupying the model's context window this turn: `input + cache.read`. - `limit` (number) — the model's total context-window size in tokens, sourced from the models.dev registry (`model.limit.context`). -- `ratio` (number) — `used / limit` (0–1). A value approaching 1 signals context-exhaustion risk. +- `ratio` (number) — `used / limit` (≥0; may exceed 1 if usage exceeds the model's registered limit). A value approaching or exceeding 1 signals context-exhaustion risk. - `null` — emitted when the model's context limit is not known (e.g. unregistered custom endpoint). 
### `text` diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts index f26b420..8a6264c 100644 --- a/packages/cli/src/cli/cmd/run.ts +++ b/packages/cli/src/cli/cmd/run.ts @@ -86,9 +86,10 @@ function fallback(part: ToolPart) { * Returns `null` (meaning "unknown") when: * - `contextLimit` is `null` — Provider.getModel threw (unregistered model) * - `contextLimit` is `0` — custom model without a registered limit defaults to - * `context: 0` (provider.ts:929). A zero limit would yield `Infinity`/`NaN` for - * ratio, which `JSON.stringify` serialises as `null` inside the object — diverging - * from the documented top-level `null` contract (EVENTS.md). + * `limit.context = 0` (the provider's default for unregistered custom models). + * A zero limit would yield `Infinity`/`NaN` for ratio, which `JSON.stringify` + * serialises as `null` inside the object — diverging from the documented + * top-level `null` contract (EVENTS.md). * * @internal exported for unit-testing only */ @@ -506,7 +507,10 @@ export const RunCommand = cmd({ // custom models) buildContextWindow returns null to signal "unknown". 
const contextLimit = await Provider.getModel(info.providerID, info.modelID) .then((m) => m.limit.context) - .catch(() => null) + .catch((e) => { + if (e instanceof Provider.ModelNotFoundError) return null + throw e + }) const contextUsed = tokens.input + tokens.cache.read const context = buildContextWindow(contextLimit, contextUsed) diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts index e7b696c..09a286c 100644 --- a/packages/cli/test/cli/usage-token-breakdown.test.ts +++ b/packages/cli/test/cli/usage-token-breakdown.test.ts @@ -77,6 +77,15 @@ describe("buildContextWindow (#86)", () => { const result = buildContextWindow(0, 9824) expect(JSON.stringify(result)).toBe("null") }) + + test("ratio may exceed 1 when usage exceeds the registered limit (unclamped)", () => { + // EVENTS.md documents ratio as ≥0 (may exceed 1), not clamped to [0,1]. + // Stale/lowered model.limit.context can produce ratio > 1 in production. + const result = buildContextWindow(100_000, 110_000) + expect(result).not.toBeNull() + expect(result!.ratio).toBeGreaterThan(1) + expect(result!.ratio).toBeCloseTo(1.1, 10) + }) }) describe("message_complete emit block shape (source-verified, #86)", () => { @@ -121,6 +130,17 @@ describe("message_complete emit block shape (source-verified, #86)", () => { const block = source.slice(idx, idx + 800) expect(block).toContain("cost:") }) + + test("getModel catch rethrows non-ModelNotFoundError (targeted catch, not swallow-all)", async () => { + // Verify the catch block only silences ModelNotFoundError; unexpected errors must propagate. + // Source check: catch body must reference ModelNotFoundError (not be an empty arrow). 
+ const source = await Bun.file(RUN_SRC).text() + const getModelIdx = source.indexOf("Provider.getModel(info.providerID") + expect(getModelIdx).toBeGreaterThan(-1) + const catchWindow = source.slice(getModelIdx, getModelIdx + 400) + expect(catchWindow).toContain("ModelNotFoundError") + expect(catchWindow).toContain("throw e") + }) }) describe("EVENTS.md documents token breakdown and context (#86)", () => { From b5babcbdc1f0672fd889961a95c31697cae5ed45 Mon Sep 17 00:00:00 2001 From: Bulat Yapparov Date: Mon, 22 Jun 2026 11:13:31 +0100 Subject: [PATCH 4/4] fix(run): include cache.write in contextUsed for context-window utilization cache.write tokens are part of the prompt sent to the model on the current turn (they occupy the context window, just billed at the cache-write rate). Omitting them undercounted utilization most on the first turn, where a large prefix is written to the cache. Fix: contextUsed = input + cache.read + cache.write. EVENTS.md: update used definition and example (used: 10848, ratio: 0.05424). Tests: regression test verifying three-way sum at call site. Co-Authored-By: Claude Sonnet 4.6 Claude-Session: https://claude.ai/code/session_0187MsfK1upr6K2BKVbmaebQ --- EVENTS.md | 4 +-- packages/cli/src/cli/cmd/run.ts | 14 ++++++---- .../test/cli/usage-token-breakdown.test.ts | 26 +++++++++++++++---- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/EVENTS.md b/EVENTS.md index 005b26d..696add3 100644 --- a/EVENTS.md +++ b/EVENTS.md @@ -88,7 +88,7 @@ Emitted when an assistant message finishes (one per LLM turn). "reasoning": 0, "cache": { "read": 8800, "write": 1024 } }, - "context": { "used": 9824, "limit": 200000, "ratio": 0.04912 }, + "context": { "used": 10848, "limit": 200000, "ratio": 0.05424 }, "finish": "tool-calls" } ``` @@ -107,7 +107,7 @@ These fields are non-overlapping: a token is counted in exactly one bucket. 
**`context`** (context-window utilization): -- `used` (number) — tokens occupying the model's context window this turn: `input + cache.read`. +- `used` (number) — tokens occupying the model's context window this turn: `input + cache.read + cache.write`. - `limit` (number) — the model's total context-window size in tokens, sourced from the models.dev registry (`model.limit.context`). - `ratio` (number) — `used / limit` (≥0; may exceed 1 if usage exceeds the model's registered limit). A value approaching or exceeding 1 signals context-exhaustion risk. - `null` — emitted when the model's context limit is not known (e.g. unregistered custom endpoint). diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts index 8a6264c..754149a 100644 --- a/packages/cli/src/cli/cmd/run.ts +++ b/packages/cli/src/cli/cmd/run.ts @@ -501,17 +501,21 @@ export const RunCommand = cmd({ }, } - // Context-window utilization: used = input + cache.read (prompt tokens - // that actually hit the model's context window). limit comes from the - // model registry (models.dev). On lookup failure (or limit===0 for - // custom models) buildContextWindow returns null to signal "unknown". + // Context-window utilization: used = input + cache.read + cache.write + // (all prompt tokens that occupy the model's context window this turn). + // cache.write tokens are written to the cache ON this turn — they are + // part of the prompt sent to the model and count against the context + // window, just billed at the cache-write rate. Excluding them + // undercounts utilization on the first turn of a conversation. + // limit comes from the model registry (models.dev). On lookup failure + // (or limit===0 for custom models) buildContextWindow returns null. 
const contextLimit = await Provider.getModel(info.providerID, info.modelID) .then((m) => m.limit.context) .catch((e) => { if (e instanceof Provider.ModelNotFoundError) return null throw e }) - const contextUsed = tokens.input + tokens.cache.read + const contextUsed = tokens.input + tokens.cache.read + tokens.cache.write const context = buildContextWindow(contextLimit, contextUsed) emit("message_complete", { diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts index 09a286c..cf68baf 100644 --- a/packages/cli/test/cli/usage-token-breakdown.test.ts +++ b/packages/cli/test/cli/usage-token-breakdown.test.ts @@ -25,13 +25,17 @@ describe("buildContextWindow (#86)", () => { expect(result).toBeNull() }) - test("computes used as the value passed in (caller sets input + cache.read)", () => { - const input = 8000 - const cacheRead = 1824 - const contextUsed = input + cacheRead + test("computes used as the value passed in (caller sets input + cache.read + cache.write)", () => { + // Regression: cache.write must be included in contextUsed at the call site (run.ts). + // buildContextWindow receives the pre-summed value; this test verifies the helper + // honours it (and that the sum is documented correctly: input + cache.read + cache.write). 
+ const input = 1024 + const cacheRead = 8800 + const cacheWrite = 1024 + const contextUsed = input + cacheRead + cacheWrite // = 10848 const result = buildContextWindow(200_000, contextUsed) expect(result).not.toBeNull() - expect(result!.used).toBe(9824) + expect(result!.used).toBe(10848) }) test("sets limit to the contextLimit value", () => { @@ -141,6 +145,18 @@ describe("message_complete emit block shape (source-verified, #86)", () => { expect(catchWindow).toContain("ModelNotFoundError") expect(catchWindow).toContain("throw e") }) + + test("contextUsed includes cache.write (regression: must not omit cache.write from context sum)", async () => { + // Regression for the bug where `contextUsed = tokens.input + tokens.cache.read` + // omitted cache.write, undercounting first-turn utilization. The fix is: + // const contextUsed = tokens.input + tokens.cache.read + tokens.cache.write + // This source-text check verifies the three-way sum is present at the call site. + const source = await Bun.file(RUN_SRC).text() + const contextUsedIdx = source.indexOf("const contextUsed =") + expect(contextUsedIdx).toBeGreaterThan(-1) + const line = source.slice(contextUsedIdx, contextUsedIdx + 100) + expect(line).toContain("cache.write") + }) }) describe("EVENTS.md documents token breakdown and context (#86)", () => {