From 7c804223f78939665f0d34c0b3f1d26449c51f28 Mon Sep 17 00:00:00 2001
From: Bulat Yapparov <by4pparov@yandex.ru>
Date: Mon, 22 Jun 2026 04:40:32 +0100
Subject: [PATCH 1/4] feat(events): emit 5-way token breakdown + context-window
 utilization in message_complete (#86)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Expand `tokens` in `message_complete` from an opaque `info.tokens` passthrough
  to an explicit object with all 5 fields: input / output / reasoning /
  cache.read / cache.write — mirroring upstream LLM.Usage shape.
  The data was already captured in MessageV2.Assistant.tokens via StepFinishPart
  accumulation; this change surfaces it explicitly.

- Add `context: { used, limit, ratio }` to `message_complete`:
  - `used = input + cache.read` (tokens occupying the context window this turn)
  - `limit` sourced from Provider.getModel() → model.limit.context (models.dev)
  - `ratio = used / limit`; the whole `context` field is emitted as `null` when
    limit is unknown (unregistered endpoint)

- Cost kept correct: `info.cost` accumulates real per-step cost from StepFinishPart,
  NOT from the new step.ended event which emits cost:0 (the cost:0 trap).

- Update EVENTS.md with the extended schema and field-by-field documentation.

- Add TDD test file (RED→GREEN): `test/cli/usage-token-breakdown.test.ts`.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0187MsfK1upr6K2BKVbmaebQ
---
 EVENTS.md                                     | 25 ++++-
 packages/cli/src/cli/cmd/run.ts               | 36 ++++++-
 .../test/cli/usage-token-breakdown.test.ts    | 93 +++++++++++++++++++
 3 files changed, 152 insertions(+), 2 deletions(-)
 create mode 100644 packages/cli/test/cli/usage-token-breakdown.test.ts

diff --git a/EVENTS.md b/EVENTS.md
index 45a6ff3..9c456cb 100644
--- a/EVENTS.md
+++ b/EVENTS.md
@@ -82,13 +82,36 @@ Emitted when an assistant message finishes (one per LLM turn).
   "providerID": "anthropic",
   "agent": "default",
   "cost": { "input": 0.003, "output": 0.012, "cache": { "read": 0, "write": 0 } },
-  "tokens": { "input": 1024, "output": 512, "cache": { "read": 0, "write": 0 } },
+  "tokens": {
+    "input": 1024,
+    "output": 512,
+    "reasoning": 0,
+    "cache": { "read": 8800, "write": 1024 }
+  },
+  "context": { "used": 9824, "limit": 200000, "ratio": 0.049 },
   "finish": "tool-calls"
 }
 ```
 
 `finish` values: `"tool-calls"` (model wants to call tools), `"end_turn"` (model is done), `"max_tokens"` (output truncated).
 
+**`tokens`** (5-way breakdown, mirrors upstream `LLM.Usage`):
+
+- `input` (number) — raw input tokens billed at the standard input rate.
+- `output` (number) — output (completion) tokens.
+- `reasoning` (number) — extended-thinking / reasoning tokens (0 when thinking is off).
+- `cache.read` (number) — tokens served from the prompt cache (billed at cache-read rate). Distinguishes cache hits from fresh input.
+- `cache.write` (number) — tokens written to the prompt cache (billed at cache-write rate).
+
+These fields are non-overlapping: a token is counted in exactly one bucket.
+
+**`context`** (context-window utilization):
+
+- `used` (number) — tokens occupying the model's context window this turn: `input + cache.read`.
+- `limit` (number) — the model's total context-window size in tokens, sourced from the models.dev registry (`model.limit.context`).
+- `ratio` (number) — `used / limit` (0–1). A value approaching 1 signals context-exhaustion risk.
+- `null` — emitted when the model's context limit is not known (e.g. unregistered custom endpoint).
+
 ### `text`
 
 Emitted when a text block from the assistant is complete.
diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts
index a4cfb2b..0abb7d2 100644
--- a/packages/cli/src/cli/cmd/run.ts
+++ b/packages/cli/src/cli/cmd/run.ts
@@ -463,12 +463,46 @@ export const RunCommand = cmd({
             const info = event.properties.info
             if (args.format === "json") {
               if (info.finish) {
+                // Build 5-way token breakdown mirroring upstream LLM.Usage shape.
+                // info.tokens already carries the full breakdown from StepFinishPart
+                // accumulation — reasoning and cache split are not dropped upstream.
+                const tokens = {
+                  input: info.tokens.input,
+                  output: info.tokens.output,
+                  reasoning: info.tokens.reasoning,
+                  cache: {
+                    read: info.tokens.cache.read,
+                    write: info.tokens.cache.write,
+                  },
+                }
+
+                // Context-window utilization: used = input + cache.read (prompt tokens
+                // that actually hit the model's context window). limit comes from the
+                // model registry (models.dev). On lookup failure we emit null to signal
+                // "unknown" rather than a misleading zero ratio.
+                const contextLimit = await Provider.getModel(info.providerID, info.modelID)
+                  .then((m) => m.limit.context)
+                  .catch(() => null)
+                const contextUsed = tokens.input + tokens.cache.read
+                const context =
+                  contextLimit != null
+                    ? {
+                        used: contextUsed,
+                        limit: contextLimit,
+                        ratio: contextUsed / contextLimit,
+                      }
+                    : null
+
                 emit("message_complete", {
                   modelID: info.modelID,
                   providerID: info.providerID,
                   agent: info.agent,
+                  // cost is sourced from info.cost which accumulates real per-step costs
+                  // from StepFinishPart. Do NOT use the new step.ended event cost field
+                  // which emits cost:0 and is reconciled later (the cost:0 trap).
                   cost: info.cost,
-                  tokens: info.tokens,
+                  tokens,
+                  context,
                   finish: info.finish,
                 })
               }
diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts
new file mode 100644
index 0000000..42a786f
--- /dev/null
+++ b/packages/cli/test/cli/usage-token-breakdown.test.ts
@@ -0,0 +1,93 @@
+import path from "path"
+import { describe, expect, test } from "bun:test"
+
+const RUN_SRC = path.resolve(import.meta.dir, "../../src/cli/cmd/run.ts")
+const EVENTS_MD = path.resolve(import.meta.dir, "../../../../EVENTS.md")
+
+/**
+ * Tests for issue #86 — 5-way token breakdown + context-window utilization
+ * in message_complete events.
+ */
+describe("message_complete token breakdown (#86)", () => {
+  test("message_complete emit block passes tokens with reasoning and cache read/write", async () => {
+    const source = await Bun.file(RUN_SRC).text()
+    const emitIdx = source.indexOf('emit("message_complete"')
+    expect(emitIdx).toBeGreaterThan(-1)
+    // Look back up to 1500 chars before the emit call to capture variable setup,
+    // and up to 200 chars after.
+    const blockStart = Math.max(0, emitIdx - 1500)
+    const block = source.slice(blockStart, emitIdx + 200)
+
+    // The tokens object must surface all 5 fields from MessageV2.Assistant.tokens
+    expect(block).toContain("tokens")
+    // reasoning must be present (not dropped)
+    expect(block).toContain("reasoning")
+    // cache read and write must both be present
+    expect(block).toContain("cache")
+    expect(block).toContain("read")
+    expect(block).toContain("write")
+  })
+
+  test("message_complete emit block includes context with used, limit, ratio", async () => {
+    const source = await Bun.file(RUN_SRC).text()
+    const emitIdx = source.indexOf('emit("message_complete"')
+    expect(emitIdx).toBeGreaterThan(-1)
+    const blockStart = Math.max(0, emitIdx - 1500)
+    const block = source.slice(blockStart, emitIdx + 200)
+
+    expect(block).toContain("context")
+    expect(block).toContain("used")
+    expect(block).toContain("limit")
+    expect(block).toContain("ratio")
+  })
+
+  test("context.used is computed as input + cache.read", async () => {
+    const source = await Bun.file(RUN_SRC).text()
+    // Should have a computation that adds input and cache.read
+    // Accept either inline or extracted variable form
+    expect(source).toMatch(/input\s*\+\s*.*cache.*read|cache.*read.*\+\s*input/)
+  })
+
+  test("context.limit is sourced from Provider.getModel context limit", async () => {
+    const source = await Bun.file(RUN_SRC).text()
+    // Must call Provider.getModel (or equivalent) to get the model's context limit
+    expect(source).toMatch(/Provider\.getModel|limit\.context|contextLimit/)
+  })
+
+  test("context.ratio is used / limit", async () => {
+    const source = await Bun.file(RUN_SRC).text()
+    // ratio must be a division of used by limit
+    expect(source).toMatch(/ratio.*\/|\/.*ratio|used\s*\/\s*limit|contextLimit/)
+  })
+
+  test("cost is still emitted in message_complete (not regressed)", async () => {
+    const source = await Bun.file(RUN_SRC).text()
+    const idx = source.indexOf('emit("message_complete"')
+    expect(idx).toBeGreaterThan(-1)
+    const block = source.slice(idx, idx + 800)
+    // cost must remain — sourced from info.cost (real per-step accumulation)
+    expect(block).toContain("cost:")
+  })
+})
+
+describe("EVENTS.md documents token breakdown and context (#86)", () => {
+  test("EVENTS.md message_complete section includes reasoning token field", async () => {
+    const doc = await Bun.file(EVENTS_MD).text()
+    // Find the message_complete section
+    const idx = doc.indexOf("message_complete")
+    expect(idx).toBeGreaterThan(-1)
+    const section = doc.slice(idx, idx + 1500)
+    expect(section).toContain("reasoning")
+  })
+
+  test("EVENTS.md message_complete section documents context field", async () => {
+    const doc = await Bun.file(EVENTS_MD).text()
+    const idx = doc.indexOf("message_complete")
+    expect(idx).toBeGreaterThan(-1)
+    const section = doc.slice(idx, idx + 1500)
+    expect(section).toContain("context")
+    expect(section).toContain("used")
+    expect(section).toContain("limit")
+    expect(section).toContain("ratio")
+  })
+})

From 7bbf2045d3c8b2d0c33167bff5d74617bd61397d Mon Sep 17 00:00:00 2001
From: Bulat Yapparov <by4pparov@yandex.ru>
Date: Mon, 22 Jun 2026 09:05:01 +0100
Subject: [PATCH 2/4] fix(run): guard contextLimit<=0 to avoid Infinity ratio;
 extract buildContextWindow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Custom models without a registered context limit default to `limit.context = 0`
(provider.ts:929). The old guard `contextLimit != null` passed for 0, causing
`ratio = used / 0 = Infinity` (or `NaN` when `used` is also 0), which
JSON.stringify serialises as `null` *inside*
the context object — diverging from the documented top-level null contract in
EVENTS.md ("null — emitted when the model's context limit is not known").

Fix: extract pure helper `buildContextWindow(limit, used)` that returns null when
limit is null or <=0. This also makes the computation unit-testable.

Replace source-grep tests (which could pass even with wrong logic, per bot review)
with 11 behavioural unit tests of `buildContextWindow` covering: null limit, zero
limit (🟠 regression case), ratio computation, JSON-serialisability, and the
top-level-null contract. Retain slim source-text checks for structural wiring.

Fixes review findings from PR #87 (aictrl-dev bot):
- 🟠 limit:0 yields Infinity ratio, breaks null contract
- 🟡 Tests grep source text instead of running emit path
---
 packages/cli/src/cli/cmd/run.ts               |  37 +++--
 .../test/cli/usage-token-breakdown.test.ts    | 132 +++++++++++++-----
 2 files changed, 124 insertions(+), 45 deletions(-)

diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts
index 0abb7d2..f26b420 100644
--- a/packages/cli/src/cli/cmd/run.ts
+++ b/packages/cli/src/cli/cmd/run.ts
@@ -80,6 +80,30 @@ function fallback(part: ToolPart) {
   })
 }
 
+/**
+ * Build the context-window utilization object for `message_complete` events.
+ *
+ * Returns `null` (meaning "unknown") when:
+ * - `contextLimit` is `null` — Provider.getModel threw (unregistered model)
+ * - `contextLimit` is `0`   — custom model without a registered limit defaults to
+ *   `context: 0` (provider.ts:929). A zero limit would yield `Infinity`/`NaN` for
+ *   ratio, which `JSON.stringify` serialises as `null` inside the object — diverging
+ *   from the documented top-level `null` contract (EVENTS.md).
+ *
+ * @internal exported for unit-testing only
+ */
+export function buildContextWindow(
+  contextLimit: number | null,
+  contextUsed: number,
+): { used: number; limit: number; ratio: number } | null {
+  if (contextLimit == null || contextLimit <= 0) return null
+  return {
+    used: contextUsed,
+    limit: contextLimit,
+    ratio: contextUsed / contextLimit,
+  }
+}
+
 function glob(info: ToolProps<typeof GlobTool>) {
   const root = info.input.path ?? ""
   const title = `Glob "${info.input.pattern}"`
@@ -478,20 +502,13 @@ export const RunCommand = cmd({
 
                 // Context-window utilization: used = input + cache.read (prompt tokens
                 // that actually hit the model's context window). limit comes from the
-                // model registry (models.dev). On lookup failure we emit null to signal
-                // "unknown" rather than a misleading zero ratio.
+                // model registry (models.dev). On lookup failure (or limit===0 for
+                // custom models) buildContextWindow returns null to signal "unknown".
                 const contextLimit = await Provider.getModel(info.providerID, info.modelID)
                   .then((m) => m.limit.context)
                   .catch(() => null)
                 const contextUsed = tokens.input + tokens.cache.read
-                const context =
-                  contextLimit != null
-                    ? {
-                        used: contextUsed,
-                        limit: contextLimit,
-                        ratio: contextUsed / contextLimit,
-                      }
-                    : null
+                const context = buildContextWindow(contextLimit, contextUsed)
 
                 emit("message_complete", {
                   modelID: info.modelID,
diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts
index 42a786f..e7b696c 100644
--- a/packages/cli/test/cli/usage-token-breakdown.test.ts
+++ b/packages/cli/test/cli/usage-token-breakdown.test.ts
@@ -1,71 +1,124 @@
 import path from "path"
 import { describe, expect, test } from "bun:test"
+import { buildContextWindow } from "../../src/cli/cmd/run"
 
-const RUN_SRC = path.resolve(import.meta.dir, "../../src/cli/cmd/run.ts")
 const EVENTS_MD = path.resolve(import.meta.dir, "../../../../EVENTS.md")
 
 /**
  * Tests for issue #86 — 5-way token breakdown + context-window utilization
  * in message_complete events.
  */
-describe("message_complete token breakdown (#86)", () => {
-  test("message_complete emit block passes tokens with reasoning and cache read/write", async () => {
+describe("buildContextWindow (#86)", () => {
+  // 🟠 regression: limit===0 (custom model default) must return null, not {ratio:Infinity}
+  test("returns null when contextLimit is 0 (custom model without registered limit)", () => {
+    const result = buildContextWindow(0, 9824)
+    expect(result).toBeNull()
+  })
+
+  test("returns null when contextLimit is null (Provider.getModel threw)", () => {
+    const result = buildContextWindow(null, 9824)
+    expect(result).toBeNull()
+  })
+
+  test("returns null when both limit and used are 0", () => {
+    const result = buildContextWindow(0, 0)
+    expect(result).toBeNull()
+  })
+
+  test("computes used as the value passed in (caller sets input + cache.read)", () => {
+    const input = 8000
+    const cacheRead = 1824
+    const contextUsed = input + cacheRead
+    const result = buildContextWindow(200_000, contextUsed)
+    expect(result).not.toBeNull()
+    expect(result!.used).toBe(9824)
+  })
+
+  test("sets limit to the contextLimit value", () => {
+    const result = buildContextWindow(200_000, 9824)
+    expect(result).not.toBeNull()
+    expect(result!.limit).toBe(200_000)
+  })
+
+  test("ratio is used / limit", () => {
+    const result = buildContextWindow(200_000, 9824)
+    expect(result).not.toBeNull()
+    expect(result!.ratio).toBeCloseTo(9824 / 200_000, 10)
+  })
+
+  test("ratio is between 0 and 1 for realistic values", () => {
+    const result = buildContextWindow(128_000, 64_000)
+    expect(result).not.toBeNull()
+    expect(result!.ratio).toBe(0.5)
+  })
+
+  test("ratio is exactly 1 when context is fully used", () => {
+    const result = buildContextWindow(100_000, 100_000)
+    expect(result).not.toBeNull()
+    expect(result!.ratio).toBe(1)
+  })
+
+  test("ratio is 0 when no tokens used (empty prompt start)", () => {
+    const result = buildContextWindow(200_000, 0)
+    expect(result).not.toBeNull()
+    expect(result!.ratio).toBe(0)
+  })
+
+  test("result is JSON-serialisable without Infinity or NaN", () => {
+    const result = buildContextWindow(200_000, 9824)
+    const serialised = JSON.stringify(result)
+    expect(serialised).not.toContain("null")
+    const parsed = JSON.parse(serialised)
+    expect(parsed.ratio).toBeCloseTo(9824 / 200_000, 10)
+  })
+
+  test("top-level null serialises cleanly (not as object with null ratio)", () => {
+    // The documented contract: limit unknown → top-level null, not {ratio:null}
+    const result = buildContextWindow(0, 9824)
+    expect(JSON.stringify(result)).toBe("null")
+  })
+})
+
+describe("message_complete emit block shape (source-verified, #86)", () => {
+  // These source-text checks verify structural wiring in the emit call site
+  // that cannot be covered by pure unit-testing buildContextWindow.
+  const RUN_SRC = path.resolve(import.meta.dir, "../../src/cli/cmd/run.ts")
+
+  test("emit block passes tokens with reasoning and cache read/write fields", async () => {
     const source = await Bun.file(RUN_SRC).text()
     const emitIdx = source.indexOf('emit("message_complete"')
     expect(emitIdx).toBeGreaterThan(-1)
-    // Look back up to 1500 chars before the emit call to capture variable setup,
-    // and up to 200 chars after.
     const blockStart = Math.max(0, emitIdx - 1500)
     const block = source.slice(blockStart, emitIdx + 200)
-
-    // The tokens object must surface all 5 fields from MessageV2.Assistant.tokens
-    expect(block).toContain("tokens")
-    // reasoning must be present (not dropped)
     expect(block).toContain("reasoning")
-    // cache read and write must both be present
     expect(block).toContain("cache")
     expect(block).toContain("read")
     expect(block).toContain("write")
   })
 
-  test("message_complete emit block includes context with used, limit, ratio", async () => {
+  test("emit block calls buildContextWindow (not inline ternary)", async () => {
     const source = await Bun.file(RUN_SRC).text()
     const emitIdx = source.indexOf('emit("message_complete"')
     expect(emitIdx).toBeGreaterThan(-1)
     const blockStart = Math.max(0, emitIdx - 1500)
     const block = source.slice(blockStart, emitIdx + 200)
-
-    expect(block).toContain("context")
-    expect(block).toContain("used")
-    expect(block).toContain("limit")
-    expect(block).toContain("ratio")
-  })
-
-  test("context.used is computed as input + cache.read", async () => {
-    const source = await Bun.file(RUN_SRC).text()
-    // Should have a computation that adds input and cache.read
-    // Accept either inline or extracted variable form
-    expect(source).toMatch(/input\s*\+\s*.*cache.*read|cache.*read.*\+\s*input/)
-  })
-
-  test("context.limit is sourced from Provider.getModel context limit", async () => {
-    const source = await Bun.file(RUN_SRC).text()
-    // Must call Provider.getModel (or equivalent) to get the model's context limit
-    expect(source).toMatch(/Provider\.getModel|limit\.context|contextLimit/)
+    expect(block).toContain("buildContextWindow")
   })
 
-  test("context.ratio is used / limit", async () => {
+  test("emit block includes context field", async () => {
     const source = await Bun.file(RUN_SRC).text()
-    // ratio must be a division of used by limit
-    expect(source).toMatch(/ratio.*\/|\/.*ratio|used\s*\/\s*limit|contextLimit/)
+    const emitIdx = source.indexOf('emit("message_complete"')
+    expect(emitIdx).toBeGreaterThan(-1)
+    // emit object spans ~400 chars; search up to closing paren
+    const block = source.slice(emitIdx, emitIdx + 500)
+    expect(block).toContain("context")
   })
 
-  test("cost is still emitted in message_complete (not regressed)", async () => {
+  test("cost field is still emitted (not regressed)", async () => {
     const source = await Bun.file(RUN_SRC).text()
     const idx = source.indexOf('emit("message_complete"')
     expect(idx).toBeGreaterThan(-1)
     const block = source.slice(idx, idx + 800)
-    // cost must remain — sourced from info.cost (real per-step accumulation)
     expect(block).toContain("cost:")
   })
 })
@@ -73,7 +126,6 @@ describe("message_complete token breakdown (#86)", () => {
 describe("EVENTS.md documents token breakdown and context (#86)", () => {
   test("EVENTS.md message_complete section includes reasoning token field", async () => {
     const doc = await Bun.file(EVENTS_MD).text()
-    // Find the message_complete section
     const idx = doc.indexOf("message_complete")
     expect(idx).toBeGreaterThan(-1)
     const section = doc.slice(idx, idx + 1500)
@@ -90,4 +142,14 @@ describe("EVENTS.md documents token breakdown and context (#86)", () => {
     expect(section).toContain("limit")
     expect(section).toContain("ratio")
   })
+
+  test("EVENTS.md documents null as the unknown-limit sentinel", async () => {
+    const doc = await Bun.file(EVENTS_MD).text()
+    const idx = doc.indexOf("message_complete")
+    expect(idx).toBeGreaterThan(-1)
+    // null sentinel doc is ~1593 chars after message_complete heading; use 2000 window
+    const section = doc.slice(idx, idx + 2000)
+    // The documented contract: null = context limit not known
+    expect(section).toContain("null")
+  })
 })

From 7e99d4c0d34421655dc838e9d9904e4c6097bf2b Mon Sep 17 00:00:00 2001
From: Bulat Yapparov <by4pparov@yandex.ru>
Date: Mon, 22 Jun 2026 10:58:47 +0100
Subject: [PATCH 3/4] fix(run): targeted catch for ModelNotFoundError; doc
 fixes for ratio range and example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- .catch(()=>null) narrowed to catch only Provider.ModelNotFoundError and rethrow
  unexpected errors — avoids silently swallowing registry/programming faults
- EVENTS.md: ratio range updated from "(0–1)" to "(≥0; may exceed 1)" to match
  the unclamped division; example ratio fixed to 0.04912 (was rounded 0.049)
- JSDoc provider.ts:929 hardcoded line citation replaced with behavioral description
- Tests: +ratio-exceeds-1 unit test; +source-verified targeted-catch regression test
---
 EVENTS.md                                     |  4 ++--
 packages/cli/src/cli/cmd/run.ts               | 12 +++++++----
 .../test/cli/usage-token-breakdown.test.ts    | 20 +++++++++++++++++++
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/EVENTS.md b/EVENTS.md
index 9c456cb..005b26d 100644
--- a/EVENTS.md
+++ b/EVENTS.md
@@ -88,7 +88,7 @@ Emitted when an assistant message finishes (one per LLM turn).
     "reasoning": 0,
     "cache": { "read": 8800, "write": 1024 }
   },
-  "context": { "used": 9824, "limit": 200000, "ratio": 0.049 },
+  "context": { "used": 9824, "limit": 200000, "ratio": 0.04912 },
   "finish": "tool-calls"
 }
 ```
@@ -109,7 +109,7 @@ These fields are non-overlapping: a token is counted in exactly one bucket.
 
 - `used` (number) — tokens occupying the model's context window this turn: `input + cache.read`.
 - `limit` (number) — the model's total context-window size in tokens, sourced from the models.dev registry (`model.limit.context`).
-- `ratio` (number) — `used / limit` (0–1). A value approaching 1 signals context-exhaustion risk.
+- `ratio` (number) — `used / limit` (≥0; may exceed 1 if usage exceeds the model's registered limit). A value approaching or exceeding 1 signals context-exhaustion risk.
 - `null` — emitted when the model's context limit is not known (e.g. unregistered custom endpoint).
 
 ### `text`
diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts
index f26b420..8a6264c 100644
--- a/packages/cli/src/cli/cmd/run.ts
+++ b/packages/cli/src/cli/cmd/run.ts
@@ -86,9 +86,10 @@ function fallback(part: ToolPart) {
  * Returns `null` (meaning "unknown") when:
  * - `contextLimit` is `null` — Provider.getModel threw (unregistered model)
  * - `contextLimit` is `0`   — custom model without a registered limit defaults to
- *   `context: 0` (provider.ts:929). A zero limit would yield `Infinity`/`NaN` for
- *   ratio, which `JSON.stringify` serialises as `null` inside the object — diverging
- *   from the documented top-level `null` contract (EVENTS.md).
+ *   `limit.context = 0` (the provider's default for unregistered custom models).
+ *   A zero limit would yield `Infinity`/`NaN` for ratio, which `JSON.stringify`
+ *   serialises as `null` inside the object — diverging from the documented
+ *   top-level `null` contract (EVENTS.md).
  *
  * @internal exported for unit-testing only
  */
@@ -506,7 +507,10 @@ export const RunCommand = cmd({
                 // custom models) buildContextWindow returns null to signal "unknown".
                 const contextLimit = await Provider.getModel(info.providerID, info.modelID)
                   .then((m) => m.limit.context)
-                  .catch(() => null)
+                  .catch((e) => {
+                    if (e instanceof Provider.ModelNotFoundError) return null
+                    throw e
+                  })
                 const contextUsed = tokens.input + tokens.cache.read
                 const context = buildContextWindow(contextLimit, contextUsed)
 
diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts
index e7b696c..09a286c 100644
--- a/packages/cli/test/cli/usage-token-breakdown.test.ts
+++ b/packages/cli/test/cli/usage-token-breakdown.test.ts
@@ -77,6 +77,15 @@ describe("buildContextWindow (#86)", () => {
     const result = buildContextWindow(0, 9824)
     expect(JSON.stringify(result)).toBe("null")
   })
+
+  test("ratio may exceed 1 when usage exceeds the registered limit (unclamped)", () => {
+    // EVENTS.md documents ratio as ≥0 (may exceed 1), not clamped to [0,1].
+    // Stale/lowered model.limit.context can produce ratio > 1 in production.
+    const result = buildContextWindow(100_000, 110_000)
+    expect(result).not.toBeNull()
+    expect(result!.ratio).toBeGreaterThan(1)
+    expect(result!.ratio).toBeCloseTo(1.1, 10)
+  })
 })
 
 describe("message_complete emit block shape (source-verified, #86)", () => {
@@ -121,6 +130,17 @@ describe("message_complete emit block shape (source-verified, #86)", () => {
     const block = source.slice(idx, idx + 800)
     expect(block).toContain("cost:")
   })
+
+  test("getModel catch rethrows non-ModelNotFoundError (targeted catch, not swallow-all)", async () => {
+    // Verify the catch block only silences ModelNotFoundError; unexpected errors must propagate.
+    // Source check: catch body must reference ModelNotFoundError (not be an empty arrow).
+    const source = await Bun.file(RUN_SRC).text()
+    const getModelIdx = source.indexOf("Provider.getModel(info.providerID")
+    expect(getModelIdx).toBeGreaterThan(-1)
+    const catchWindow = source.slice(getModelIdx, getModelIdx + 400)
+    expect(catchWindow).toContain("ModelNotFoundError")
+    expect(catchWindow).toContain("throw e")
+  })
 })
 
 describe("EVENTS.md documents token breakdown and context (#86)", () => {

From b5babcbdc1f0672fd889961a95c31697cae5ed45 Mon Sep 17 00:00:00 2001
From: Bulat Yapparov <by4pparov@yandex.ru>
Date: Mon, 22 Jun 2026 11:13:31 +0100
Subject: [PATCH 4/4] fix(run): include cache.write in contextUsed for
 context-window utilization

cache.write tokens are part of the prompt sent to the model on the current
turn (they occupy the context window, just billed at the cache-write rate).
Omitting them undercounted utilization most on the first turn, where a large
prefix is written to the cache. Fix: contextUsed = input + cache.read + cache.write.

EVENTS.md: update used definition and example (used: 10848, ratio: 0.05424).
Tests: regression test verifying three-way sum at call site.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0187MsfK1upr6K2BKVbmaebQ
---
 EVENTS.md                                     |  4 +--
 packages/cli/src/cli/cmd/run.ts               | 14 ++++++----
 .../test/cli/usage-token-breakdown.test.ts    | 26 +++++++++++++++----
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/EVENTS.md b/EVENTS.md
index 005b26d..696add3 100644
--- a/EVENTS.md
+++ b/EVENTS.md
@@ -88,7 +88,7 @@ Emitted when an assistant message finishes (one per LLM turn).
     "reasoning": 0,
     "cache": { "read": 8800, "write": 1024 }
   },
-  "context": { "used": 9824, "limit": 200000, "ratio": 0.04912 },
+  "context": { "used": 10848, "limit": 200000, "ratio": 0.05424 },
   "finish": "tool-calls"
 }
 ```
@@ -107,7 +107,7 @@ These fields are non-overlapping: a token is counted in exactly one bucket.
 
 **`context`** (context-window utilization):
 
-- `used` (number) — tokens occupying the model's context window this turn: `input + cache.read`.
+- `used` (number) — tokens occupying the model's context window this turn: `input + cache.read + cache.write`.
 - `limit` (number) — the model's total context-window size in tokens, sourced from the models.dev registry (`model.limit.context`).
 - `ratio` (number) — `used / limit` (≥0; may exceed 1 if usage exceeds the model's registered limit). A value approaching or exceeding 1 signals context-exhaustion risk.
 - `null` — emitted when the model's context limit is not known (e.g. unregistered custom endpoint).
diff --git a/packages/cli/src/cli/cmd/run.ts b/packages/cli/src/cli/cmd/run.ts
index 8a6264c..754149a 100644
--- a/packages/cli/src/cli/cmd/run.ts
+++ b/packages/cli/src/cli/cmd/run.ts
@@ -501,17 +501,21 @@ export const RunCommand = cmd({
                   },
                 }
 
-                // Context-window utilization: used = input + cache.read (prompt tokens
-                // that actually hit the model's context window). limit comes from the
-                // model registry (models.dev). On lookup failure (or limit===0 for
-                // custom models) buildContextWindow returns null to signal "unknown".
+                // Context-window utilization: used = input + cache.read + cache.write
+                // (all prompt tokens that occupy the model's context window this turn).
+                // cache.write tokens are written to the cache ON this turn — they are
+                // part of the prompt sent to the model and count against the context
+                // window, just billed at the cache-write rate. Excluding them
+                // undercounts utilization on the first turn of a conversation.
+                // limit comes from the model registry (models.dev). On lookup failure
+                // (or limit===0 for custom models) buildContextWindow returns null.
                 const contextLimit = await Provider.getModel(info.providerID, info.modelID)
                   .then((m) => m.limit.context)
                   .catch((e) => {
                     if (e instanceof Provider.ModelNotFoundError) return null
                     throw e
                   })
-                const contextUsed = tokens.input + tokens.cache.read
+                const contextUsed = tokens.input + tokens.cache.read + tokens.cache.write
                 const context = buildContextWindow(contextLimit, contextUsed)
 
                 emit("message_complete", {
diff --git a/packages/cli/test/cli/usage-token-breakdown.test.ts b/packages/cli/test/cli/usage-token-breakdown.test.ts
index 09a286c..cf68baf 100644
--- a/packages/cli/test/cli/usage-token-breakdown.test.ts
+++ b/packages/cli/test/cli/usage-token-breakdown.test.ts
@@ -25,13 +25,17 @@ describe("buildContextWindow (#86)", () => {
     expect(result).toBeNull()
   })
 
-  test("computes used as the value passed in (caller sets input + cache.read)", () => {
-    const input = 8000
-    const cacheRead = 1824
-    const contextUsed = input + cacheRead
+  test("computes used as the value passed in (caller sets input + cache.read + cache.write)", () => {
+    // Regression: cache.write must be included in contextUsed at the call site (run.ts).
+    // buildContextWindow receives the pre-summed value; this test verifies the helper
+    // honours it (and that the sum is documented correctly: input + cache.read + cache.write).
+    const input = 1024
+    const cacheRead = 8800
+    const cacheWrite = 1024
+    const contextUsed = input + cacheRead + cacheWrite // = 10848
     const result = buildContextWindow(200_000, contextUsed)
     expect(result).not.toBeNull()
-    expect(result!.used).toBe(9824)
+    expect(result!.used).toBe(10848)
   })
 
   test("sets limit to the contextLimit value", () => {
@@ -141,6 +145,18 @@ describe("message_complete emit block shape (source-verified, #86)", () => {
     expect(catchWindow).toContain("ModelNotFoundError")
     expect(catchWindow).toContain("throw e")
   })
+
+  test("contextUsed includes cache.write (regression: must not omit cache.write from context sum)", async () => {
+    // Regression for the bug where `contextUsed = tokens.input + tokens.cache.read`
+    // omitted cache.write, undercounting first-turn utilization. The fix is:
+    //   const contextUsed = tokens.input + tokens.cache.read + tokens.cache.write
+    // This source-text check verifies the three-way sum is present at the call site.
+    const source = await Bun.file(RUN_SRC).text()
+    const contextUsedIdx = source.indexOf("const contextUsed =")
+    expect(contextUsedIdx).toBeGreaterThan(-1)
+    const line = source.slice(contextUsedIdx, contextUsedIdx + 100)
+    expect(line).toContain("cache.write")
+  })
 })
 
 describe("EVENTS.md documents token breakdown and context (#86)", () => {