From 0fe4e59141d3dc71ec716ca67403b0348c88f9f8 Mon Sep 17 00:00:00 2001 From: Truffle Date: Thu, 30 Apr 2026 21:40:15 -0700 Subject: [PATCH 1/9] feat: pass Phantom continuity through Murph context --- chat-ui/src/components/tool-call-card.tsx | 4 +- .../phase-10g-pi-continuity.md | 75 ++++++ .../agent-sdk-boundary-callers.test.ts | 68 +++++- src/agent/__tests__/murph-context.test.ts | 57 +++++ src/agent/__tests__/prompt-assembler.test.ts | 8 + src/agent/agent-sdk.ts | 3 + src/agent/chat-query.ts | 60 +++-- src/agent/murph-context.ts | 72 ++++++ src/agent/prompt-assembler.ts | 20 +- src/agent/prompt-blocks/ui-guidance.ts | 4 + src/agent/runtime.ts | 2 +- src/chat/__tests__/continuity-context.test.ts | 109 +++++++++ src/chat/__tests__/writer.test.ts | 69 +++++- src/chat/continuity-context.ts | 213 ++++++++++++++++++ src/chat/event-log.ts | 13 ++ src/chat/writer.ts | 6 + src/ui/tools.ts | 9 +- 17 files changed, 757 insertions(+), 35 deletions(-) create mode 100644 research/chat-experience/phase-10g-pi-continuity.md create mode 100644 src/agent/__tests__/murph-context.test.ts create mode 100644 src/agent/murph-context.ts create mode 100644 src/chat/__tests__/continuity-context.test.ts create mode 100644 src/chat/continuity-context.ts diff --git a/chat-ui/src/components/tool-call-card.tsx b/chat-ui/src/components/tool-call-card.tsx index 6d381762..e65f8322 100644 --- a/chat-ui/src/components/tool-call-card.tsx +++ b/chat-ui/src/components/tool-call-card.tsx @@ -161,11 +161,11 @@ export function ToolCallCard({ tool }: { tool: ToolCallState }) { const inputDetails = toolInputDetails(tool); const output = tool.output ? 
redactSensitiveText(truncate(tool.output, TOOL_OUTPUT_DISPLAY_LIMIT)) : ""; - const autoExpand = tool.state === "running" || tool.state === "result" || tool.state === "error" || tool.state === "blocked"; + const autoExpand = tool.state === "error" || tool.state === "blocked"; const [isOpen, setIsOpen] = useState(autoExpand); useEffect(() => { - if (tool.state === "running" || tool.state === "result" || tool.state === "error" || tool.state === "blocked") { + if (tool.state === "error" || tool.state === "blocked") { setIsOpen(true); } }, [tool.state]); diff --git a/research/chat-experience/phase-10g-pi-continuity.md b/research/chat-experience/phase-10g-pi-continuity.md new file mode 100644 index 00000000..75be1083 --- /dev/null +++ b/research/chat-experience/phase-10g-pi-continuity.md @@ -0,0 +1,75 @@ +# Phase 10G Pi Continuity Context + +Date: 2026-04-30 + +## Problem + +A live Phantom-on-Murph browser session showed that Murph compaction can preserve +protocol validity and still lose host-level app facts that the user expects the +agent to remember, such as the exact page URL produced by `phantom_create_page`. +The symptom was not specific to page URLs. It was a continuity issue after a +long, tool-heavy run. + +## Pi Grounding + +Pi already provides the primitive we need: + +- `transformContext` runs at the AgentMessage level before `convertToLlm`. +- Pi custom messages require the app to also provide a `convertToLlm` + implementation. Murph's default Pi converter intentionally passes only + `user`, `assistant`, and `toolResult` messages. +- Phantom should therefore inject host facts through `transformContext` as a + normal user-context message, not as a custom role that the default converter + would filter out. + +Murph already exposes this primitive through `MurphOptions.transformContext`, +passes it through query normalization, and forwards it into the Pi harness. + +## Decision + +Do not build a parallel Phantom continuity runtime. 
Phantom should derive compact +host facts from its existing durable stream log and pass them to Murph through +`transformContext` as a Pi-compatible user-context message. Murph remains +responsible for raw transcript compaction, replay, tool-call protocol validity, +provider transport, and retry behavior. + +## Current Implementation + +- `src/chat/continuity-context.ts` scans the tail of `chat_stream_events`. +- It extracts user-visible page artifacts from `phantom_create_page` and + `phantom_preview_page`. +- It intentionally excludes `phantom_generate_login` authentication links from + page artifacts. +- It includes recent `session.compact_boundary` checkpoints. +- `src/agent/murph-context.ts` wraps that context in + `` and inserts it as a Pi-compatible user-context + message before the latest user message when possible. +- The chat query path uses this transform only on `agent_runtime: murph`. + Anthropic fallback can still receive the same context through the system + prompt append path. +- Tool call cards now default collapsed, with errors and blocked calls still + opening automatically. + +## Verification + +- Focused Phantom tests pass: + `bun test src/agent/__tests__/murph-context.test.ts src/chat/__tests__/continuity-context.test.ts src/chat/__tests__/writer.test.ts src/agent/__tests__/agent-sdk-boundary-callers.test.ts src/agent/__tests__/prompt-assembler.test.ts` +- Full Phantom tests pass: `bun test`. +- Phantom typecheck passes: `bun run typecheck`. +- Phantom lint passes: `bun run lint`. +- Chat UI typecheck and production build pass. +- Murph shim test and typecheck pass for `Options.transformContext`. + +## Live Verification + +Phantom was run locally on top of the locally rebuilt Murph shim with the OpenAI +provider and `gpt-5.5`. + +Verified: + +- A chat request created and previewed `/ui/continuity-smoke-final.html`. +- The served page returned HTTP 200 and contained the expected smoke text. 
+- A follow-up asking for the exact created page URL returned the page URL, not + a login link. +- Completed tool cards rendered collapsed by default. An errored tool card still + opened automatically. diff --git a/src/agent/__tests__/agent-sdk-boundary-callers.test.ts b/src/agent/__tests__/agent-sdk-boundary-callers.test.ts index cc42bf84..de86cb13 100644 --- a/src/agent/__tests__/agent-sdk-boundary-callers.test.ts +++ b/src/agent/__tests__/agent-sdk-boundary-callers.test.ts @@ -4,7 +4,13 @@ import { z } from "zod/v4"; import { PhantomConfigSchema } from "../../config/schemas.ts"; import type { PhantomConfig } from "../../config/types.ts"; import { runMigrations } from "../../db/migrate.ts"; -import { type AgentSdkQueryParams, type Query, type SDKMessage, __setAgentSdkQueryForTests } from "../agent-sdk.ts"; +import { + type AgentSdkQueryOptions, + type AgentSdkQueryParams, + type Query, + type SDKMessage, + __setAgentSdkQueryForTests, +} from "../agent-sdk.ts"; import { executeChatQuery } from "../chat-query.ts"; import { CostTracker } from "../cost-tracker.ts"; import { runJudgeQuery } from "../judge-query.ts"; @@ -358,6 +364,54 @@ describe("Agent SDK boundary callers", () => { expect(options?.thinking).toEqual({ type: "enabled", budgetTokens: 8192 }); }); + test("chat query path passes Phantom continuity through Murph transformContext", async () => { + __setAgentSdkQueryForTests((params) => { + calls.push(params); + return queryFromMessages([initMessage(), assistantMessage("chat assistant"), resultMessage("chat result")]); + }); + + await executeChatQuery( + { + config: makeConfig({ + agent_runtime: "murph", + model: "gpt-5.5", + provider: { type: "openai" }, + }), + sessionStore: new SessionStore(db), + costTracker: new CostTracker(db), + memoryContextBuilder: null, + evolvedConfig: null, + roleTemplate: null, + onboardingPrompt: null, + mcpServerFactories: null, + }, + "web:chat-session", + { role: "user", content: "give me the page link" }, + Date.now(), + { 
+ signal: new AbortController().signal, + sessionContext: "User-visible page: http://127.0.0.1:3112/ui/profile.html", + onSdkEvent: () => {}, + }, + ); + const options = calls[0]?.options as AgentSdkQueryOptions | undefined; + const transformContext = options?.transformContext; + expect(transformContext).toBeDefined(); + const systemPrompt = calls[0]?.options?.systemPrompt; + if (typeof systemPrompt === "object" && systemPrompt !== null && "append" in systemPrompt) { + expect(systemPrompt.append).not.toContain("User-visible page"); + } else { + throw new Error("Expected object system prompt"); + } + + const transformed = (await transformContext?.([{ role: "user", content: "same prompt" }])) ?? []; + expect(transformed).toHaveLength(2); + const contextMessage = transformed[0] as Record; + expect(contextMessage.role).toBe("user"); + expect(textFromContent(contextMessage.content)).toContain(""); + expect(textFromContent(contextMessage.content)).toContain("http://127.0.0.1:3112/ui/profile.html"); + }); + test("chat query retries stale resume result frames without forwarding the error result", async () => { const sdkEvents: SDKMessage[] = []; let factoryCalls = 0; @@ -474,3 +528,15 @@ describe("Agent SDK boundary callers", () => { expect(options?.env?.OPENAI_API_KEY).toBe("openai-secret"); }); }); + +function textFromContent(content: unknown): string { + if (typeof content === "string") return content; + if (!Array.isArray(content)) return ""; + return content + .map((item) => { + if (item === null || typeof item !== "object" || Array.isArray(item)) return ""; + const block = item as Record; + return block.type === "text" && typeof block.text === "string" ? 
block.text : ""; + }) + .join("\n"); +} diff --git a/src/agent/__tests__/murph-context.test.ts b/src/agent/__tests__/murph-context.test.ts new file mode 100644 index 00000000..87c19c2a --- /dev/null +++ b/src/agent/__tests__/murph-context.test.ts @@ -0,0 +1,57 @@ +import { describe, expect, test } from "bun:test"; +import { createMurphContextTransform } from "../murph-context.ts"; + +describe("createMurphContextTransform", () => { + test("injects Phantom context as a Pi-compatible user message before the latest user message", async () => { + const transform = createMurphContextTransform("User-visible page: http://127.0.0.1:3100/ui/profile.html"); + expect(transform).toBeDefined(); + + const userMessage = { role: "user", content: [{ type: "text", text: "Give me the link." }] }; + const output = await transform?.([{ role: "assistant", content: [] }, userMessage]); + + expect(output).toHaveLength(3); + expect(record(output?.[1])?.role).toBe("user"); + expect(textContent(output?.[1])).toContain(""); + expect(textContent(output?.[1])).toContain("http://127.0.0.1:3100/ui/profile.html"); + expect(output?.[2]).toBe(userMessage); + }); + + test("replaces stale Phantom context messages instead of accumulating them", async () => { + const transform = createMurphContextTransform("Fresh context"); + const staleContext = { + role: "user", + content: [{ type: "text", text: "\nStale context\n" }], + timestamp: 1, + }; + + const output = + (await transform?.([{ role: "assistant", content: [] }, staleContext, { role: "toolResult", content: [] }])) ?? 
+ []; + + const phantomContexts = output.filter((message) => textContent(message).includes("")); + expect(phantomContexts).toHaveLength(1); + expect(textContent(phantomContexts[0])).toContain("Fresh context"); + expect(output).not.toContain(staleContext); + }); + + test("returns undefined for empty context", () => { + expect(createMurphContextTransform(" ")).toBeUndefined(); + expect(createMurphContextTransform(undefined)).toBeUndefined(); + }); +}); + +function record(value: unknown): Record | undefined { + return value !== null && typeof value === "object" ? (value as Record) : undefined; +} + +function textContent(value: unknown): string { + const content = record(value)?.content; + if (typeof content === "string") return content; + if (!Array.isArray(content)) return ""; + return content + .map((item) => { + const block = record(item); + return block?.type === "text" && typeof block.text === "string" ? block.text : ""; + }) + .join("\n"); +} diff --git a/src/agent/__tests__/prompt-assembler.test.ts b/src/agent/__tests__/prompt-assembler.test.ts index 502ccff6..9378896f 100644 --- a/src/agent/__tests__/prompt-assembler.test.ts +++ b/src/agent/__tests__/prompt-assembler.test.ts @@ -161,4 +161,12 @@ describe("assemblePrompt UI vocabulary guidance", () => { const prompt = assemblePrompt(baseConfig); expect(prompt).toContain("public/_examples/"); }); + + test("distinguishes created page URLs from authentication links", () => { + const prompt = assemblePrompt(baseConfig); + expect(prompt).toContain("Page URLs and login URLs are different."); + expect(prompt).toContain("return the exact /ui/ page URL"); + expect(prompt).toContain("Only call phantom_generate_login"); + expect(prompt).toContain("Do not substitute"); + }); }); diff --git a/src/agent/agent-sdk.ts b/src/agent/agent-sdk.ts index b65410da..60b1dd33 100644 --- a/src/agent/agent-sdk.ts +++ b/src/agent/agent-sdk.ts @@ -35,6 +35,9 @@ export type { }; export type AgentSdkQueryParams = Parameters[0]; +export type 
AgentSdkQueryOptions = NonNullable & { + transformContext?: (messages: unknown[], signal?: AbortSignal) => Promise | unknown[]; +}; export type AgentSdkQuery = (params: AgentSdkQueryParams) => Query; export type AgentSdkRuntimeSelection = { agentRuntime: AgentRuntimeKind; diff --git a/src/agent/chat-query.ts b/src/agent/chat-query.ts index 3710b2e4..4bea2b58 100644 --- a/src/agent/chat-query.ts +++ b/src/agent/chat-query.ts @@ -1,7 +1,13 @@ // Extracted chat-specific query logic for the runForChat method. // Lives outside runtime.ts to keep that file under the 300-line budget. -import { type McpServerConfig, type SDKMessage, type SDKUserMessage, query } from "./agent-sdk.ts"; +import { + type AgentSdkQueryOptions, + type McpServerConfig, + type SDKMessage, + type SDKUserMessage, + query, +} from "./agent-sdk.ts"; type MessageParam = SDKUserMessage["message"]; import { buildAgentRuntimeEnv, resolveAgentRuntimeModel } from "../config/providers.ts"; @@ -14,6 +20,7 @@ import { type AgentCost, type AgentResponse, emptyCost } from "./events.ts"; import { createDangerousCommandBlocker, createFileTracker } from "./hooks.ts"; import { extractTextFromMessageParam } from "./message-param-utils.ts"; import { extractCost, extractTextFromMessage } from "./message-utils.ts"; +import { createMurphContextTransform } from "./murph-context.ts"; import { permissionOptionsFromConfig } from "./permission-options.ts"; import { assemblePrompt } from "./prompt-assembler.ts"; import { isNoConversationFoundResult, sdkResultErrorText } from "./sdk-result-errors.ts"; @@ -36,7 +43,7 @@ export async function executeChatQuery( sessionKey: string, message: MessageParam, startTime: number, - options: { signal: AbortSignal; onSdkEvent: (msg: SDKMessage) => void }, + options: { signal: AbortSignal; onSdkEvent: (msg: SDKMessage) => void; sessionContext?: string }, ): Promise { const parts = sessionKey.split(":"); const channelId = parts[0] ?? 
"web"; @@ -55,6 +62,7 @@ export async function executeChatQuery( /* Memory unavailable */ } } + const useMurphContextTransform = deps.config.agent_runtime === "murph"; const appendPrompt = assemblePrompt( deps.config, memoryContext, @@ -62,7 +70,9 @@ export async function executeChatQuery( deps.roleTemplate ?? undefined, deps.onboardingPrompt ?? undefined, undefined, + useMurphContextTransform ? undefined : options.sessionContext, ); + const transformContext = useMurphContextTransform ? createMurphContextTransform(options.sessionContext) : undefined; const queryModel = resolveAgentRuntimeModel(deps.config, deps.config.model); const providerEnv = buildAgentRuntimeEnv(deps.config, queryModel); @@ -93,30 +103,32 @@ export async function executeChatQuery( await Promise.all(Object.entries(deps.mcpServerFactories).map(async ([k, f]) => [k, await f()] as const)), ) : undefined; + const queryOptions: AgentSdkQueryOptions = { + model: queryModel, + ...permissionOptions, + settingSources: ["project", "user"], + systemPrompt: { + type: "preset" as const, + preset: "claude_code" as const, + append: appendPrompt, + }, + persistSession: true, + effort: deps.config.effort, + thinking: getThinkingConfig(queryModel), + includePartialMessages: true, + agentProgressSummaries: true, + promptSuggestions: true, + ...(deps.config.max_budget_usd > 0 ? { maxBudgetUsd: deps.config.max_budget_usd } : {}), + abortController: controller, + env: { ...process.env, ...providerEnv }, + hooks: { PreToolUse: [commandBlocker], PostToolUse: [fileTracker.hook] }, + ...(useResume && session?.sdk_session_id ? { resume: session.sdk_session_id } : {}), + ...(mcpServers ? { mcpServers } : {}), + ...(transformContext ? 
{ transformContext } : {}), + }; const queryStream = query({ prompt: makePrompt(), - options: { - model: queryModel, - ...permissionOptions, - settingSources: ["project", "user"], - systemPrompt: { - type: "preset" as const, - preset: "claude_code" as const, - append: appendPrompt, - }, - persistSession: true, - effort: deps.config.effort, - thinking: getThinkingConfig(queryModel), - includePartialMessages: true, - agentProgressSummaries: true, - promptSuggestions: true, - ...(deps.config.max_budget_usd > 0 ? { maxBudgetUsd: deps.config.max_budget_usd } : {}), - abortController: controller, - env: { ...process.env, ...providerEnv }, - hooks: { PreToolUse: [commandBlocker], PostToolUse: [fileTracker.hook] }, - ...(useResume && session?.sdk_session_id ? { resume: session.sdk_session_id } : {}), - ...(mcpServers ? { mcpServers } : {}), - }, + options: queryOptions, }); for await (const msg of queryStream) { diff --git a/src/agent/murph-context.ts b/src/agent/murph-context.ts new file mode 100644 index 00000000..231086eb --- /dev/null +++ b/src/agent/murph-context.ts @@ -0,0 +1,72 @@ +export type MurphContextTransform = (messages: unknown[], signal?: AbortSignal) => Promise | unknown[]; + +const PHANTOM_CONTEXT_OPEN_TAG = ""; +const PHANTOM_CONTEXT_CLOSE_TAG = ""; + +type PhantomContextMessage = { + role: "user"; + content: [{ type: "text"; text: string }]; + timestamp: number; +}; + +export function createMurphContextTransform(context: string | undefined): MurphContextTransform | undefined { + const trimmed = context?.trim(); + if (!trimmed) return undefined; + + return (messages: unknown[]) => { + const cleaned = messages.filter((message) => !isPhantomContextMessage(message)); + const contextMessage = buildContextMessage(trimmed); + if (cleaned.length === 0) { + return [contextMessage]; + } + + const lastIndex = cleaned.length - 1; + const lastMessage = cleaned[lastIndex]; + if (hasRole(lastMessage, "user")) { + return [...cleaned.slice(0, lastIndex), contextMessage, 
lastMessage]; + } + + return [...cleaned, contextMessage]; + }; +} + +function buildContextMessage(content: string): PhantomContextMessage { + return { + role: "user", + content: [ + { + type: "text", + text: [ + PHANTOM_CONTEXT_OPEN_TAG, + "Durable context supplied by Phantom outside the raw transcript.", + "Use it to continue after Murph compaction without asking the user to repeat known app state.", + content, + PHANTOM_CONTEXT_CLOSE_TAG, + ].join("\n"), + }, + ], + timestamp: Date.now(), + }; +} + +function isPhantomContextMessage(message: unknown): boolean { + if (!isRecord(message) || message.role !== "user") return false; + const content = message.content; + if (typeof content === "string") return content.includes(PHANTOM_CONTEXT_OPEN_TAG); + if (!Array.isArray(content)) return false; + return content.some( + (item) => isRecord(item) && item.type === "text" && textField(item).includes(PHANTOM_CONTEXT_OPEN_TAG), + ); +} + +function hasRole(message: unknown, role: string): boolean { + return isRecord(message) && message.role === role; +} + +function isRecord(value: unknown): value is Record { + return value !== null && typeof value === "object"; +} + +function textField(record: Record): string { + return typeof record.text === "string" ? 
record.text : ""; +} diff --git a/src/agent/prompt-assembler.ts b/src/agent/prompt-assembler.ts index 1c134309..3f8ccc3f 100644 --- a/src/agent/prompt-assembler.ts +++ b/src/agent/prompt-assembler.ts @@ -18,6 +18,7 @@ export function assemblePrompt( roleTemplate?: RoleTemplate, onboardingPrompt?: string, dataDir?: string, + chatRuntimeContext?: string, ): string { const sections: string[] = []; @@ -74,6 +75,10 @@ export function assemblePrompt( sections.push(buildMemorySection(memoryContext)); } + if (chatRuntimeContext) { + sections.push(buildChatRuntimeContext(chatRuntimeContext)); + } + return sections.join("\n\n"); } @@ -140,9 +145,14 @@ function buildEnvironment(config: PhantomConfig): string { lines.push(""); lines.push("Schedule types: one-shot (at), interval (every N ms), cron (weekdays at 9am)."); lines.push(""); - lines.push("To give a user access to a /ui/ page, call phantom_generate_login to create a magic link"); - lines.push("and send the link to them via Slack. The link must be sent as plain text without any"); - lines.push("Markdown wrapping (no asterisks, no bold, no parentheses) so Slack renders it cleanly."); + lines.push("Page URLs and login URLs are different."); + lines.push("When the user asks for the page, link, profile, report, dashboard, or thing you created,"); + lines.push("return the exact /ui/ page URL from phantom_create_page or phantom_preview_page."); + lines.push("Only call phantom_generate_login when the user explicitly asks for access, auth,"); + lines.push("a login link, a magic link, or says they cannot open a page because login is required."); + lines.push("If you share a login link, label it as an authentication link. Do not substitute"); + lines.push("a login link for a created page URL."); + lines.push("Links must be sent as plain text without Markdown wrapping so Slack renders them cleanly."); lines.push(""); lines.push(...buildUIGuidanceLines(publicUrl ?? 
undefined)); lines.push(""); @@ -231,6 +241,10 @@ function buildMemorySection(memoryContext: string): string { return `# Your Memory\n\nPersistent memory from previous sessions. Use this to maintain continuity.\n\n${memoryContext}`; } +function buildChatRuntimeContext(chatRuntimeContext: string): string { + return `# Current Chat Context\n\n${chatRuntimeContext}`; +} + function buildFallbackRoleHint(config: PhantomConfig): string { return `Your role is ${config.role}. Approach every task with that expertise.`; } diff --git a/src/agent/prompt-blocks/ui-guidance.ts b/src/agent/prompt-blocks/ui-guidance.ts index 887d1a22..4eb8c5c1 100644 --- a/src/agent/prompt-blocks/ui-guidance.ts +++ b/src/agent/prompt-blocks/ui-guidance.ts @@ -147,6 +147,10 @@ export function buildUIGuidanceLines(publicUrl: string | undefined): string[] { lines.push("phantom_preview_page with the same path. Review the screenshot. Read the"); lines.push("JSON metadata block. If console.errors > 0 or network.failedRequests > 0,"); lines.push("fix the HTML and re-preview until both are zero. 
Only then report the page."); + lines.push("Preserve the exact page URL returned by phantom_create_page or preview metadata."); + lines.push("When the user later asks for the page or link you created, return that page URL."); + lines.push("Do not answer a page-link request by calling phantom_generate_login unless"); + lines.push("the user explicitly asks for an authentication link."); lines.push(""); if (publicUrl) { lines.push(`Pages are at ${publicUrl}/ui/`); diff --git a/src/agent/runtime.ts b/src/agent/runtime.ts index 43aaf1e6..64917696 100644 --- a/src/agent/runtime.ts +++ b/src/agent/runtime.ts @@ -126,7 +126,7 @@ export class AgentRuntime { async runForChat( sessionKey: string, message: MessageParam, - options: { signal: AbortSignal; onSdkEvent: (msg: SDKMessage) => void }, + options: { signal: AbortSignal; onSdkEvent: (msg: SDKMessage) => void; sessionContext?: string }, ): Promise { if (this.activeSessions.has(sessionKey)) { return { text: "Error: session busy", sessionId: "", cost: emptyCost(), durationMs: 0 }; diff --git a/src/chat/__tests__/continuity-context.test.ts b/src/chat/__tests__/continuity-context.test.ts new file mode 100644 index 00000000..dbbe9850 --- /dev/null +++ b/src/chat/__tests__/continuity-context.test.ts @@ -0,0 +1,109 @@ +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { MIGRATIONS } from "../../db/schema.ts"; +import { buildChatContinuityContext } from "../continuity-context.ts"; +import { ChatEventLog } from "../event-log.ts"; +import { ChatSessionStore } from "../session-store.ts"; + +let db: Database; +let eventLog: ChatEventLog; +let sessionStore: ChatSessionStore; + +beforeEach(() => { + db = new Database(":memory:"); + for (const sql of MIGRATIONS) { + db.run(sql); + } + eventLog = new ChatEventLog(db); + sessionStore = new ChatSessionStore(db); +}); + +afterEach(() => { + db.close(); +}); + +describe("buildChatContinuityContext", () => { + 
test("summarizes created page artifacts from the durable stream log", () => { + const session = sessionStore.create(); + eventLog.append(session.id, null, 1, "message.tool_call_start", { + event: "message.tool_call_start", + tool_call_id: "tool-1", + tool_name: "phantom_create_page", + message_id: "assistant-1", + parent_tool_use_id: null, + is_mcp: true, + }); + eventLog.append(session.id, null, 2, "message.tool_call_input_end", { + event: "message.tool_call_input_end", + tool_call_id: "tool-1", + input: { + path: "muhammad-ahmed-cheema.html", + title: "Muhammad Ahmed Cheema Profile", + }, + }); + eventLog.append(session.id, null, 3, "message.tool_call_result", { + event: "message.tool_call_result", + tool_call_id: "tool-1", + tool_name: "phantom_create_page", + status: "success", + output: JSON.stringify({ + path: "muhammad-ahmed-cheema.html", + url: "http://127.0.0.1:3112/ui/muhammad-ahmed-cheema.html", + size: 12345, + }), + }); + + const context = buildChatContinuityContext({ sessionId: session.id, eventLog }); + + expect(context).toContain("User-visible page artifacts"); + expect(context).toContain("Muhammad Ahmed Cheema Profile"); + expect(context).toContain("http://127.0.0.1:3112/ui/muhammad-ahmed-cheema.html"); + expect(context).toContain("muhammad-ahmed-cheema.html"); + expect(context).not.toContain("/ui/login"); + }); + + test("skips login links and keeps recent compact checkpoints", () => { + const session = sessionStore.create(); + eventLog.append(session.id, null, 1, "session.compact_boundary", { + event: "session.compact_boundary", + trigger: "auto", + pre_tokens: 1434337, + }); + eventLog.append(session.id, null, 2, "message.tool_call_result", { + event: "message.tool_call_result", + tool_call_id: "tool-login", + tool_name: "phantom_generate_login", + status: "success", + output: JSON.stringify({ + magicLink: "http://127.0.0.1:3112/ui/login?magic=secret", + }), + }); + + const context = buildChatContinuityContext({ sessionId: session.id, eventLog 
}); + + expect(context).toContain("auto compaction at stream seq 1 before about 1,434,337 tokens."); + expect(context).toContain("Authentication links"); + expect(context).not.toContain("magic=secret"); + }); + + test("uses the latest stream events when the full event log is larger than the scan limit", () => { + const session = sessionStore.create(); + for (let seq = 1; seq <= 12; seq++) { + eventLog.append(session.id, null, seq, "session.status", { + event: "session.status", + status: "working", + permission_mode: "bypassPermissions", + }); + } + eventLog.append(session.id, null, 13, "session.compact_boundary", { + event: "session.compact_boundary", + trigger: "auto", + pre_tokens: 500000, + }); + + const context = buildChatContinuityContext({ sessionId: session.id, eventLog, limit: 3 }); + + expect(context).toContain("stream seq 13"); + expect(context).toContain("500,000"); + }); +}); diff --git a/src/chat/__tests__/writer.test.ts b/src/chat/__tests__/writer.test.ts index ed6167fb..68beaaea 100644 --- a/src/chat/__tests__/writer.test.ts +++ b/src/chat/__tests__/writer.test.ts @@ -36,7 +36,7 @@ function mockRuntime(overrides?: { runForChat?: ( key: string, msg: unknown, - opts: { signal: AbortSignal; onSdkEvent: (msg: unknown) => void }, + opts: { signal: AbortSignal; onSdkEvent: (msg: unknown) => void; sessionContext?: string }, ) => Promise<{ text: string; sessionId: string; @@ -305,6 +305,73 @@ describe("ChatSessionWriter", () => { expect(timelines[0]?.summary.status).toBe("completed"); }); + test("passes durable page context into the chat runtime", async () => { + const session = sessionStore.create(); + let capturedContext: string | undefined; + eventLog.append(session.id, null, 1, "message.tool_call_start", { + event: "message.tool_call_start", + tool_call_id: "tool-1", + tool_name: "phantom_create_page", + message_id: "assistant-1", + parent_tool_use_id: null, + is_mcp: true, + }); + eventLog.append(session.id, null, 2, "message.tool_call_input_end", { + 
event: "message.tool_call_input_end", + tool_call_id: "tool-1", + input: { + path: "profile.html", + title: "Profile Page", + }, + }); + eventLog.append(session.id, null, 3, "message.tool_call_result", { + event: "message.tool_call_result", + tool_call_id: "tool-1", + tool_name: "phantom_create_page", + status: "success", + output: JSON.stringify({ + path: "profile.html", + url: "http://127.0.0.1:3112/ui/profile.html", + }), + }); + + const writer = new ChatSessionWriter({ + sessionId: session.id, + runtime: mockRuntime({ + runForChat: async (_key, _message, opts) => { + capturedContext = opts.sessionContext; + opts.onSdkEvent({ + type: "result", + subtype: "success", + result: "ok", + stop_reason: "end_turn", + total_cost_usd: 0, + usage: {}, + modelUsage: {}, + duration_ms: 0, + num_turns: 1, + }); + return { + text: "ok", + sessionId: "sdk-1", + cost: { totalUsd: 0, inputTokens: 0, outputTokens: 0, modelUsage: {} }, + durationMs: 0, + }; + }, + }), + eventLog, + messageStore, + sessionStore, + streamBus, + }); + writer.claim(); + + await writer.run({ role: "user", content: "give me the page link" }, "t1", "give me the page link"); + + expect(capturedContext).toContain("Profile Page"); + expect(capturedContext).toContain("http://127.0.0.1:3112/ui/profile.html"); + }); + test("persists errored run timeline without committing assistant id", async () => { const session = sessionStore.create(); const writer = new ChatSessionWriter({ diff --git a/src/chat/continuity-context.ts b/src/chat/continuity-context.ts new file mode 100644 index 00000000..19e51949 --- /dev/null +++ b/src/chat/continuity-context.ts @@ -0,0 +1,213 @@ +import type { ChatEventLog, ChatStreamEvent } from "./event-log.ts"; + +const DEFAULT_EVENT_SCAN_LIMIT = 5000; +const MAX_ARTIFACTS = 8; +const MAX_COMPACTIONS = 3; +const MAX_LABEL_LENGTH = 90; +const PAGE_TOOLS = new Set(["phantom_create_page", "phantom_preview_page"]); + +type BuildChatContinuityContextInput = { + sessionId: string; + eventLog: 
ChatEventLog; + limit?: number; +}; + +type ToolAccumulator = { + seq: number; + toolName?: string; + input?: unknown; + output?: string; + status?: string; +}; + +type PageArtifact = { + seq: number; + toolName: string; + label: string; + url?: string; + path?: string; + size?: number; +}; + +type CompactCheckpoint = { + seq: number; + trigger?: string; + preTokens?: number; +}; + +export function buildChatContinuityContext(input: BuildChatContinuityContextInput): string | undefined { + const events = input.eventLog.tail(input.sessionId, input.limit ?? DEFAULT_EVENT_SCAN_LIMIT); + const tools = new Map(); + const compactions: CompactCheckpoint[] = []; + + for (const event of events) { + const payload = parsePayload(event); + if (!payload) continue; + const eventType = stringField(payload, "event") ?? event.event_type; + + if (eventType === "session.compact_boundary") { + compactions.push({ + seq: event.seq, + trigger: stringField(payload, "trigger"), + preTokens: numberField(payload, "pre_tokens"), + }); + continue; + } + + if (!eventType.startsWith("message.tool_call_")) continue; + const toolCallId = stringField(payload, "tool_call_id"); + if (!toolCallId) continue; + const tool = tools.get(toolCallId) ?? { seq: event.seq }; + tool.seq = event.seq; + + const toolName = stringField(payload, "tool_name"); + if (toolName) tool.toolName = toolName; + + if (eventType === "message.tool_call_input_end") { + tool.input = payload.input; + } else if (eventType === "message.tool_call_running") { + const outputPreview = stringField(payload, "output_preview"); + if (outputPreview && !tool.output) tool.output = outputPreview; + } else if (eventType === "message.tool_call_result") { + tool.status = stringField(payload, "status"); + tool.output = stringField(payload, "output") ?? stringField(payload, "output_preview") ?? tool.output; + } + + tools.set(toolCallId, tool); + } + + const artifacts = dedupeArtifacts([...tools.values()].flatMap((tool) => artifactFromTool(tool) ?? 
[])); + const latestCompactions = compactions.slice(-MAX_COMPACTIONS); + if (artifacts.length === 0 && latestCompactions.length === 0) { + return undefined; + } + + return renderContext({ + artifacts: artifacts.slice(-MAX_ARTIFACTS), + compactions: latestCompactions, + }); +} + +function renderContext(input: { artifacts: PageArtifact[]; compactions: CompactCheckpoint[] }): string { + const lines = [ + "Durable Phantom chat context:", + "- The transcript may have been compacted by Murph. Continue from the latest user message using these host facts when relevant.", + "- Authentication links from phantom_generate_login are not page artifacts.", + ]; + + if (input.compactions.length > 0) { + lines.push("", "Recent compaction checkpoints:"); + for (const checkpoint of input.compactions) { + const trigger = checkpoint.trigger ?? "unknown"; + const tokens = + checkpoint.preTokens === undefined + ? "" + : ` before about ${checkpoint.preTokens.toLocaleString("en-US")} tokens`; + lines.push(`- ${trigger} compaction at stream seq ${checkpoint.seq}${tokens}.`); + } + } + + if (input.artifacts.length > 0) { + lines.push("", "User-visible page artifacts from earlier tool work:"); + for (const artifact of input.artifacts) { + const parts = [`- ${artifact.label}`]; + if (artifact.url) parts.push(` URL: ${artifact.url}`); + if (artifact.path) parts.push(` path: ${artifact.path}`); + if (artifact.size !== undefined) parts.push(` size: ${artifact.size} bytes`); + parts.push(` via ${artifact.toolName} at stream seq ${artifact.seq}.`); + lines.push(parts.join(";")); + } + } + + return lines.join("\n"); +} + +function artifactFromTool(tool: ToolAccumulator): PageArtifact | undefined { + if (!tool.toolName || !PAGE_TOOLS.has(tool.toolName)) return undefined; + + const input = recordFromUnknown(tool.input); + const output = parseJsonRecord(tool.output); + const path = stringField(output, "path") ?? stringField(input, "path"); + const url = normalizePageUrl( + stringField(output, "url") ?? 
+ stringField(output, "publicUrl") ?? + stringField(output, "pageUrl") ?? + urlFromText(tool.output), + ); + if (!url && !path) return undefined; + + const title = stringField(input, "title") ?? stringField(output, "title") ?? path ?? url ?? "Created page"; + const size = numberField(output, "size"); + return { + seq: tool.seq, + toolName: tool.toolName, + label: truncate(title, MAX_LABEL_LENGTH), + ...(url ? { url } : {}), + ...(path ? { path } : {}), + ...(size !== undefined ? { size } : {}), + }; +} + +function dedupeArtifacts(artifacts: PageArtifact[]): PageArtifact[] { + const byKey = new Map(); + for (const artifact of artifacts) { + const key = artifact.url ?? artifact.path ?? `${artifact.toolName}:${artifact.seq}`; + byKey.set(key, artifact); + } + return [...byKey.values()].sort((left, right) => left.seq - right.seq); +} + +function parsePayload(event: ChatStreamEvent): Record | undefined { + try { + const parsed = JSON.parse(event.payload_json); + return recordFromUnknown(parsed); + } catch { + return undefined; + } +} + +function parseJsonRecord(value: string | undefined): Record | undefined { + if (!value) return undefined; + try { + return recordFromUnknown(JSON.parse(value)); + } catch { + return undefined; + } +} + +function recordFromUnknown(value: unknown): Record | undefined { + if (value === null || typeof value !== "object" || Array.isArray(value)) { + return undefined; + } + return value as Record; +} + +function stringField(record: Record | undefined, key: string): string | undefined { + const value = record?.[key]; + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function numberField(record: Record | undefined, key: string): number | undefined { + const value = record?.[key]; + return typeof value === "number" && Number.isFinite(value) ? 
value : undefined; +} + +function normalizePageUrl(url: string | undefined): string | undefined { + if (!url || !url.includes("/ui/") || url.includes("/ui/login")) { + return undefined; + } + return url; +} + +function urlFromText(text: string | undefined): string | undefined { + if (!text) return undefined; + const match = text.match(/https?:\/\/[^\s"']+\/ui\/[^\s"']+/); + return normalizePageUrl(match?.[0]); +} + +function truncate(value: string, maxLength: number): string { + if (value.length <= maxLength) return value; + return `${value.slice(0, maxLength - 3)}...`; +} diff --git a/src/chat/event-log.ts b/src/chat/event-log.ts index ecccb8e8..f8b89242 100644 --- a/src/chat/event-log.ts +++ b/src/chat/event-log.ts @@ -45,6 +45,19 @@ export class ChatEventLog { .all(sessionId, afterSeq, maxRows) as ChatStreamEvent[]; } + tail(sessionId: string, limit?: number): ChatStreamEvent[] { + const maxRows = limit ?? 5000; + const rows = this.db + .query( + `SELECT * FROM chat_stream_events + WHERE session_id = ? 
+ ORDER BY seq DESC + LIMIT ?`, + ) + .all(sessionId, maxRows) as ChatStreamEvent[]; + return rows.reverse(); + } + getMaxSeq(sessionId: string): number { const row = this.db .query("SELECT MAX(seq) as max_seq FROM chat_stream_events WHERE session_id = ?") diff --git a/src/chat/writer.ts b/src/chat/writer.ts index dc579852..e87906a9 100644 --- a/src/chat/writer.ts +++ b/src/chat/writer.ts @@ -3,6 +3,7 @@ import type { SDKUserMessage } from "../agent/agent-sdk.ts"; type MessageParam = SDKUserMessage["message"]; import type { AgentRuntime } from "../agent/runtime.ts"; import { autoRenameSession } from "./auto-rename.ts"; +import { buildChatContinuityContext } from "./continuity-context.ts"; import type { ChatEventLog } from "./event-log.ts"; import type { ChatMessageStore } from "./message-store.ts"; import type { NotificationTriggerService } from "./notifications/triggers.ts"; @@ -99,8 +100,13 @@ export class ChatSessionWriter { let resultText = ""; try { + const sessionContext = buildChatContinuityContext({ + sessionId: this.deps.sessionId, + eventLog: this.deps.eventLog, + }); const response = await this.deps.runtime.runForChat(sessionKey, message, { signal: this.abortController.signal, + sessionContext, onSdkEvent: (sdkMsg: unknown) => { const frames = translateSdkMessage(sdkMsg as Record, ctx); for (const frame of frames) { diff --git a/src/ui/tools.ts b/src/ui/tools.ts index 507f0da0..56934335 100644 --- a/src/ui/tools.ts +++ b/src/ui/tools.ts @@ -27,7 +27,7 @@ export function createWebUiToolServer( "phantom_create_page", "Create or update an HTML page served at /ui/. If html is provided, writes it directly. " + "If title and content are provided instead, wraps the content in the base template. " + - "Returns the public URL of the page.", + "Returns the page URL to share when the user asks for the page you created.", { path: z.string().min(1).describe("File path relative to public/, e.g. 
'dashboard.html' or 'reports/weekly.html'"), html: z.string().optional().describe("Full HTML content to write (use this for complete pages)"), @@ -77,6 +77,7 @@ export function createWebUiToolServer( path: safePath, url: publicUrl, size: htmlContent.length, + note: "This is the created page URL, not a login link.", }); } catch (error: unknown) { const msg = error instanceof Error ? error.message : String(error); @@ -87,7 +88,9 @@ export function createWebUiToolServer( const generateLoginTool = tool( "phantom_generate_login", - "Generate a magic link for web UI authentication. Send this link to the user via Slack. " + + "Generate a magic link for web UI authentication. Use only when the user asks for access, " + + "auth, login, a magic link, or says they cannot open a page because login is required. " + + "Do not use this to answer a request for the page URL of something you created. " + "The link expires in 10 minutes. After authentication, the session lasts 7 days.", {}, async () => { @@ -100,7 +103,7 @@ export function createWebUiToolServer( // sessionToken intentionally excluded - agent should only share the magic link expiresIn: "10 minutes", sessionDuration: "7 days", - note: "Send the magic link to the user via Slack. They click it and are authenticated instantly.", + note: "This is an authentication link, not a page URL. Send it only when login access is needed.", }); } catch (error: unknown) { const msg = error instanceof Error ? 
error.message : String(error); From 593ade461566a6e3852beff9be44525b78ac3537 Mon Sep 17 00:00:00 2001 From: Truffle Date: Thu, 30 Apr 2026 22:48:20 -0700 Subject: [PATCH 2/9] fix chat transcript integrity --- chat-ui/src/components/assistant-message.tsx | 11 +- chat-ui/src/components/chat-input.tsx | 27 +- chat-ui/src/components/user-message.tsx | 55 +- chat-ui/src/hooks/use-attachments.ts | 70 ++- chat-ui/src/hooks/use-chat.ts | 22 +- chat-ui/src/lib/__tests__/chat-store.test.ts | 130 ++++ chat-ui/src/lib/chat-message-content.ts | 75 +++ chat-ui/src/lib/chat-store.ts | 51 +- chat-ui/src/lib/chat-types.ts | 9 + chat-ui/src/lib/client.ts | 13 +- chat-ui/src/routes/session-route.tsx | 22 +- prompts/phase-10h-chat-integrity-builder.md | 117 ++++ prompts/phase-10h-chat-integrity-review.md | 57 ++ prompts/phase-10h-phantom-chat-review.md | 59 ++ prompts/phase-10h-pi-thinking-research.md | 68 ++ .../phase-10h-provider-thinking-research.md | 72 +++ .../phase-10h-chat-integrity-review.md | 50 ++ .../phase-10h-live-verification.md | 66 ++ .../phase-10h-phantom-chat-review.md | 150 +++++ .../phase-10h-pi-thinking-research.md | 590 ++++++++++++++++++ .../phase-10h-product-direction.md | 79 +++ .../phase-10h-provider-thinking-research.md | 353 +++++++++++ src/chat/__tests__/http.test.ts | 71 +++ src/chat/__tests__/message-builder.test.ts | 7 +- src/chat/__tests__/sdk-to-wire.test.ts | 25 + src/chat/__tests__/writer.test.ts | 108 ++++ src/chat/http-handlers.ts | 45 +- src/chat/message-builder.ts | 97 ++- src/chat/sdk-to-wire.ts | 2 +- src/chat/types.ts | 8 +- src/chat/upload.ts | 43 +- src/chat/writer.ts | 44 +- 32 files changed, 2493 insertions(+), 103 deletions(-) create mode 100644 chat-ui/src/lib/chat-message-content.ts create mode 100644 prompts/phase-10h-chat-integrity-builder.md create mode 100644 prompts/phase-10h-chat-integrity-review.md create mode 100644 prompts/phase-10h-phantom-chat-review.md create mode 100644 prompts/phase-10h-pi-thinking-research.md create mode 
100644 prompts/phase-10h-provider-thinking-research.md create mode 100644 research/chat-experience/phase-10h-chat-integrity-review.md create mode 100644 research/chat-experience/phase-10h-live-verification.md create mode 100644 research/chat-experience/phase-10h-phantom-chat-review.md create mode 100644 research/chat-experience/phase-10h-pi-thinking-research.md create mode 100644 research/chat-experience/phase-10h-product-direction.md create mode 100644 research/chat-experience/phase-10h-provider-thinking-research.md diff --git a/chat-ui/src/components/assistant-message.tsx b/chat-ui/src/components/assistant-message.tsx index fc4df8b2..0b5be3bb 100644 --- a/chat-ui/src/components/assistant-message.tsx +++ b/chat-ui/src/components/assistant-message.tsx @@ -1,4 +1,5 @@ import type { ChatMessage, ThinkingBlockState, ToolCallState } from "@/lib/chat-types"; +import { getAssistantTextBlocks } from "@/lib/chat-message-content"; import { Markdown } from "./markdown"; import { ThinkingBlock } from "./thinking-block"; import { ToolCallCard } from "./tool-call-card"; @@ -12,8 +13,8 @@ export function AssistantMessage({ toolCalls: ToolCallState[]; thinkingBlocks: ThinkingBlockState[]; }) { - const textContent = - message.content.find((b) => b.type === "text")?.text ?? ""; + const textBlocks = getAssistantTextBlocks(message); + const hasText = textBlocks.length > 0; const isStreaming = message.status === "streaming"; @@ -28,9 +29,11 @@ export function AssistantMessage({ ))} - {textContent && } + {textBlocks.map((textContent, index) => ( + + ))} - {isStreaming && !textContent && toolCalls.length === 0 && ( + {isStreaming && !hasText && toolCalls.length === 0 && (
diff --git a/chat-ui/src/components/chat-input.tsx b/chat-ui/src/components/chat-input.tsx index cdb778c8..261941de 100644 --- a/chat-ui/src/components/chat-input.tsx +++ b/chat-ui/src/components/chat-input.tsx @@ -15,7 +15,7 @@ export function ChatInput({ onRemoveFile, initialText, }: { - onSend: (text: string) => void; + onSend: (text: string) => boolean | void | Promise; onStop: () => void; isStreaming: boolean; disabled?: boolean; @@ -25,6 +25,7 @@ export function ChatInput({ initialText?: string; }) { const [text, setText] = useState(initialText ?? ""); + const [isSubmitting, setIsSubmitting] = useState(false); const textareaRef = useRef(null); const composingRef = useRef(false); const fileInputRef = useRef(null); @@ -46,15 +47,21 @@ export function ChatInput({ } }, [initialText]); - const handleSend = useCallback(() => { + const handleSend = useCallback(async () => { const trimmed = text.trim(); - if (!trimmed || isStreaming) return; - onSend(trimmed); - setText(""); - if (textareaRef.current) { - textareaRef.current.style.height = "auto"; + if (!trimmed || isStreaming || disabled || isSubmitting) return; + setIsSubmitting(true); + try { + const sent = await onSend(trimmed); + if (sent === false) return; + setText(""); + if (textareaRef.current) { + textareaRef.current.style.height = "auto"; + } + } finally { + setIsSubmitting(false); } - }, [text, isStreaming, onSend]); + }, [text, isStreaming, disabled, isSubmitting, onSend]); const handleKeyDown = useCallback( (e: KeyboardEvent) => { @@ -120,7 +127,7 @@ export function ChatInput({ }} placeholder="Send a message..." 
rows={1} - disabled={disabled} + disabled={disabled || isSubmitting} enterKeyHint="send" className="max-h-[200px] min-h-[36px] flex-1 resize-none bg-transparent px-2 py-1.5 text-sm text-foreground placeholder:text-muted-foreground focus:outline-none" aria-label="Message input" @@ -141,7 +148,7 @@ export function ChatInput({ variant="ghost" size="icon" onClick={handleSend} - disabled={!text.trim() || disabled} + disabled={!text.trim() || disabled || isSubmitting} className="h-8 w-8 shrink-0 rounded-lg bg-primary text-primary-content hover:bg-primary/90 disabled:opacity-50" aria-label="Send message" > diff --git a/chat-ui/src/components/user-message.tsx b/chat-ui/src/components/user-message.tsx index c04f3a8d..6ed80aa8 100644 --- a/chat-ui/src/components/user-message.tsx +++ b/chat-ui/src/components/user-message.tsx @@ -1,14 +1,67 @@ import type { ChatMessage } from "@/lib/chat-types"; +import { File, FileText } from "lucide-react"; export function UserMessage({ message }: { message: ChatMessage }) { const text = message.content.find((b) => b.type === "text")?.text ?? ""; + const attachments = message.attachments ?? []; return (
-

{text}

+ {attachments.length > 0 && ( + + )} + {text &&

{text}

}
); } + +function AttachmentIcon({ + mimeType, + previewUrl, + filename, +}: { + mimeType: string; + previewUrl: string; + filename: string; +}) { + if (mimeType.startsWith("image/")) { + return ( + + {filename} + + ); + } + if (mimeType === "application/pdf") { + return ; + } + if (mimeType.startsWith("text/") || mimeType === "application/json") { + return ; + } + return ; +} + +function formatBytes(size: number): string { + if (size < 1024) return `${size} B`; + if (size < 1024 * 1024) return `${Math.round(size / 1024)} KB`; + return `${(size / (1024 * 1024)).toFixed(1)} MB`; +} diff --git a/chat-ui/src/hooks/use-attachments.ts b/chat-ui/src/hooks/use-attachments.ts index 05e36b8a..5bdd5fe8 100644 --- a/chat-ui/src/hooks/use-attachments.ts +++ b/chat-ui/src/hooks/use-attachments.ts @@ -23,18 +23,29 @@ export type PendingAttachment = { export type AttachmentResult = { id: string; + client_id?: string; filename: string; mime_type: string; size: number; preview_url: string; }; +export type UploadFilesResult = { + intendedCount: number; + acceptedIds: string[]; + failedCount: number; +}; + +export function shouldBlockSendAfterUpload(result: UploadFilesResult): boolean { + return result.intendedCount > 0 && result.failedCount > 0; +} + export function useAttachments(): { files: PendingAttachment[]; addFiles: (newFiles: File[]) => void; removeFile: (id: string) => void; clearFiles: () => void; - uploadFiles: (sessionId: string) => Promise; + uploadFiles: (sessionId: string) => Promise; hasFiles: boolean; isUploading: boolean; } { @@ -103,15 +114,31 @@ export function useAttachments(): { }, []); const uploadFiles = useCallback( - async (sessionId: string): Promise => { - const pending = filesRef.current.filter((f) => f.status === "pending"); - if (pending.length === 0) return []; + async (sessionId: string): Promise => { + const current = filesRef.current; + const intendedCount = current.length; + const alreadyAcceptedIds = current + .map((file) => (file.status === 
"done" ? file.serverId : undefined)) + .filter((id): id is string => typeof id === "string" && id.length > 0); + const pending = current.filter((f) => f.serverId === undefined && (f.status === "pending" || f.status === "error")); + if (intendedCount === 0) { + return { intendedCount: 0, acceptedIds: [], failedCount: 0 }; + } + if (pending.length === 0) { + return { + intendedCount, + acceptedIds: alreadyAcceptedIds, + failedCount: intendedCount - alreadyAcceptedIds.length, + }; + } - setFiles((prev) => prev.map((f) => (f.status === "pending" ? { ...f, status: "uploading" as const } : f))); + const pendingIds = new Set(pending.map((file) => file.id)); + setFiles((prev) => prev.map((f) => (pendingIds.has(f.id) ? { ...f, status: "uploading" as const } : f))); const formData = new FormData(); for (const p of pending) { formData.append("file", p.file); + formData.append("client_id", p.id); } try { @@ -133,12 +160,12 @@ export function useAttachments(): { prev.map((f) => (f.status === "uploading" ? { ...f, status: "error" as const } : f)), ); toast.error(errorMsg); - return []; + return { intendedCount, acceptedIds: alreadyAcceptedIds, failedCount: pending.length }; } const body = (await res.json()) as { attachments?: AttachmentResult[]; - rejected?: Array<{ filename: string; reason: string; message: string }>; + rejected?: Array<{ client_id?: string; filename: string; reason: string; message: string }>; }; if (body.rejected) { @@ -147,19 +174,40 @@ export function useAttachments(): { } } - const serverIds = (body.attachments ?? []).map((a) => a.id); + const accepted = body.attachments ?? []; + const acceptedByClientId = new Map( + accepted + .filter((attachment) => typeof attachment.client_id === "string") + .map((attachment) => [attachment.client_id as string, attachment.id] as const), + ); + const rejectedIds = new Set( + (body.rejected ?? 
[]) + .map((rejection) => rejection.client_id) + .filter((id): id is string => typeof id === "string" && id.length > 0), + ); setFiles((prev) => - prev.map((f) => (f.status === "uploading" ? { ...f, status: "done" as const } : f)), + prev.map((f) => { + if (!pendingIds.has(f.id)) return f; + const serverId = acceptedByClientId.get(f.id); + if (serverId) { + return { ...f, status: "done" as const, serverId }; + } + return { ...f, status: "error" as const }; + }), ); - return serverIds; + const acceptedIds = [...alreadyAcceptedIds, ...accepted.map((attachment) => attachment.id)]; + const failedCount = pending.filter( + (file) => !acceptedByClientId.has(file.id) || rejectedIds.has(file.id), + ).length; + return { intendedCount, acceptedIds, failedCount }; } catch { setFiles((prev) => prev.map((f) => (f.status === "uploading" ? { ...f, status: "error" as const } : f)), ); toast.error("Upload failed. Please try again."); - return []; + return { intendedCount, acceptedIds: alreadyAcceptedIds, failedCount: pending.length }; } }, [], diff --git a/chat-ui/src/hooks/use-chat.ts b/chat-ui/src/hooks/use-chat.ts index 5b4ceee5..54d143f0 100644 --- a/chat-ui/src/hooks/use-chat.ts +++ b/chat-ui/src/hooks/use-chat.ts @@ -1,4 +1,5 @@ import { runTimelineSummaryToView } from "@/lib/chat-activity"; +import { parseMessageContentJson } from "@/lib/chat-message-content"; import { type ChatStore, beginRunActivity, createChatStore, dispatchFrame } from "@/lib/chat-store"; import type { ChatMessage, @@ -265,28 +266,13 @@ function buildTimelineViewMap(detail: SessionDetail): Map = []; - try { - const parsed = JSON.parse(row.content_json); - if (typeof parsed === "string") { - contentBlocks = [{ type: "text", text: parsed }]; - } else if (Array.isArray(parsed)) { - contentBlocks = parsed; - } else { - contentBlocks = [parsed]; - } - } catch { - contentBlocks = [{ type: "text", text: row.content_json }]; - } + const parsed = parseMessageContentJson(row.content_json, row.role); return { id: 
row.id, role: row.role as "user" | "assistant", - content: contentBlocks, + content: parsed.contentBlocks, + attachments: parsed.attachments, createdAt: row.created_at, status: row.status as "committed" | "streaming" | "error", stopReason: row.stop_reason, diff --git a/chat-ui/src/lib/__tests__/chat-store.test.ts b/chat-ui/src/lib/__tests__/chat-store.test.ts index fc8a6caf..8949e3f4 100644 --- a/chat-ui/src/lib/__tests__/chat-store.test.ts +++ b/chat-ui/src/lib/__tests__/chat-store.test.ts @@ -1,5 +1,7 @@ import { describe, expect, it } from "vitest"; +import { shouldBlockSendAfterUpload } from "../../hooks/use-attachments"; import { ACTIVE_RUN_MESSAGE_ID } from "../chat-activity"; +import { getAssistantTextBlocks } from "../chat-message-content"; import { beginRunActivity, createChatStore, dispatchFrame } from "../chat-store"; function send(store: ReturnType, event: string, data: Record): void { @@ -127,6 +129,134 @@ describe("chat-store reducer: text block lifecycle", () => { expect(store.getState().thinkingBlocks.get("tk_0_0")?.isStreaming).toBe(false); }); + + it("preserves multiple assistant text blocks for rendering", () => { + const store = createChatStore(); + send(store, "message.assistant_start", { message_id: "a1" }); + send(store, "message.text_start", { + message_id: "a1", + text_block_id: "tb_0_0", + index: 0, + }); + send(store, "message.text_delta", { + text_block_id: "tb_0_0", + delta: "Before the tool.", + }); + send(store, "message.text_end", { text_block_id: "tb_0_0" }); + send(store, "message.text_start", { + message_id: "a1", + text_block_id: "tb_0_2", + index: 2, + }); + send(store, "message.text_delta", { + text_block_id: "tb_0_2", + delta: "After the tool.", + }); + send(store, "message.text_end", { text_block_id: "tb_0_2" }); + + const assistant = store.getState().messages[0]; + expect(assistant?.content.filter((block) => block.type === "text")).toHaveLength(2); + if (assistant) { + 
expect(getAssistantTextBlocks(assistant)).toEqual(["Before the tool.", "After the tool."]); + } + }); + + it("session.error marks a previously ended assistant row as error", () => { + const store = createChatStore(); + send(store, "message.assistant_start", { message_id: "a1" }); + send(store, "message.text_start", { + message_id: "a1", + text_block_id: "tb_0_0", + index: 0, + }); + send(store, "message.text_delta", { + text_block_id: "tb_0_0", + delta: "Partial answer", + }); + send(store, "message.assistant_end", { + message_id: "a1", + interrupted: false, + }); + expect(store.getState().messages[0]?.status).toBe("committed"); + + send(store, "session.error", { + session_id: "s1", + message_id: "a1", + subtype: "error_during_execution", + recoverable: false, + errors: ["Provider failed"], + cost_usd: 0, + duration_ms: 10, + }); + + expect(store.getState().messages[0]?.status).toBe("error"); + }); +}); + +describe("chat-store reducer: user attachments", () => { + it("preserves attachment metadata from user.message frames", () => { + const store = createChatStore(); + send(store, "user.message", { + message_id: "u1", + text: "Review this.", + attachments: [ + { + id: "att-1", + filename: "brief.pdf", + mime_type: "application/pdf", + size_bytes: 1234, + preview_url: "/chat/attachments/att-1/preview", + }, + ], + sent_at: "2026-04-30T00:00:00.000Z", + source_tab_id: "tab-1", + }); + + const user = store.getState().messages[0]; + expect(user?.attachments).toEqual([ + { + id: "att-1", + filename: "brief.pdf", + mimeType: "application/pdf", + sizeBytes: 1234, + previewUrl: "/chat/attachments/att-1/preview", + }, + ]); + }); + + it("replayed user.message attachments are idempotent", () => { + const store = createChatStore(); + const frame = { + message_id: "u1", + text: "Review this.", + attachments: [ + { + id: "att-1", + filename: "brief.pdf", + mime_type: "application/pdf", + size_bytes: 1234, + preview_url: "/chat/attachments/att-1/preview", + }, + ], + sent_at: 
"2026-04-30T00:00:00.000Z", + source_tab_id: "tab-1", + }; + replay(store, "user.message", frame); + replay(store, "user.message", frame); + + const user = store.getState().messages[0]; + expect(user?.attachments).toHaveLength(1); + expect(user?.attachments?.[0]?.id).toBe("att-1"); + }); +}); + +describe("attachment upload send gate", () => { + it("distinguishes no files from failed intended uploads", () => { + expect(shouldBlockSendAfterUpload({ intendedCount: 0, acceptedIds: [], failedCount: 0 })).toBe(false); + expect(shouldBlockSendAfterUpload({ intendedCount: 1, acceptedIds: [], failedCount: 1 })).toBe(true); + expect(shouldBlockSendAfterUpload({ intendedCount: 2, acceptedIds: ["att-1"], failedCount: 1 })).toBe(true); + expect(shouldBlockSendAfterUpload({ intendedCount: 1, acceptedIds: ["att-1"], failedCount: 0 })).toBe(false); + }); }); describe("chat-store reducer: run activity", () => { diff --git a/chat-ui/src/lib/chat-message-content.ts b/chat-ui/src/lib/chat-message-content.ts new file mode 100644 index 00000000..be520ab2 --- /dev/null +++ b/chat-ui/src/lib/chat-message-content.ts @@ -0,0 +1,75 @@ +import type { ChatAttachmentView, ChatMessage, ContentBlock } from "./chat-types"; + +export type ParsedMessageContent = { + contentBlocks: ContentBlock[]; + attachments?: ChatAttachmentView[]; +}; + +export function parseMessageContentJson(contentJson: string, role: string): ParsedMessageContent { + try { + const parsed = JSON.parse(contentJson); + if (typeof parsed === "string") { + return { contentBlocks: [{ type: "text", text: parsed }] }; + } + if (Array.isArray(parsed)) { + return normalizeDurableContent(parsed, role); + } + return normalizeDurableContent([parsed], role); + } catch { + return { contentBlocks: [{ type: "text", text: contentJson }] }; + } +} + +export function getAssistantTextBlocks(message: Pick): string[] { + return message.content + .filter((block) => block.type === "text" && typeof block.text === "string" && block.text.length > 0) + 
.map((block) => block.text as string); +} + +function normalizeDurableContent(blocks: unknown[], role: string): ParsedMessageContent { + if (role !== "user") { + return { contentBlocks: blocks.filter(isRecord).map(recordToContentBlock) }; + } + const contentBlocks: ContentBlock[] = []; + const attachments: ChatAttachmentView[] = []; + for (const block of blocks) { + if (!isRecord(block)) continue; + if (block.type === "attachment") { + const attachment = normalizeDurableAttachment(block); + if (attachment) attachments.push(attachment); + continue; + } + if (block.type === "text") { + contentBlocks.push({ type: "text", text: typeof block.text === "string" ? block.text : "" }); + } + } + return { contentBlocks, attachments: attachments.length > 0 ? attachments : undefined }; +} + +function recordToContentBlock(block: Record): ContentBlock { + return { ...block, type: typeof block.type === "string" ? block.type : "text" }; +} + +function normalizeDurableAttachment(block: Record): ChatAttachmentView | null { + if (typeof block.id !== "string" || block.id.length === 0) return null; + const filename = typeof block.filename === "string" && block.filename.length > 0 ? block.filename : "file"; + const mimeType = + typeof block.mime_type === "string" + ? block.mime_type + : typeof block.mimeType === "string" + ? block.mimeType + : "application/octet-stream"; + const sizeValue = block.size_bytes ?? block.sizeBytes; + const previewValue = block.preview_url ?? block.previewUrl; + return { + id: block.id, + filename, + mimeType, + sizeBytes: typeof sizeValue === "number" ? sizeValue : null, + previewUrl: typeof previewValue === "string" ? 
previewValue : `/chat/attachments/${block.id}/preview`, + }; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} diff --git a/chat-ui/src/lib/chat-store.ts b/chat-ui/src/lib/chat-store.ts index e378a8bd..840a2392 100644 --- a/chat-ui/src/lib/chat-store.ts +++ b/chat-ui/src/lib/chat-store.ts @@ -11,7 +11,7 @@ import { dispatchToolRunning, dispatchToolStart, } from "./chat-dispatch-tools"; -import type { ChatMessage, ChatState } from "./chat-types"; +import type { ChatAttachmentView, ChatMessage, ChatState } from "./chat-types"; type Listener = () => void; @@ -81,16 +81,55 @@ function upsertMessage(messages: ChatMessage[], message: ChatMessage): ChatMessa const next = [...messages]; const existing = next[existingIndex]; if (!existing) return messages; + const nextAttachments = mergeAttachments(existing.attachments, message.attachments); next[existingIndex] = { ...existing, ...message, content: existing.content.length > 0 ? existing.content : message.content, + attachments: nextAttachments, status: existing.status === "committed" ? existing.status : message.status, runTimeline: existing.runTimeline ?? 
message.runTimeline, }; return next; } +function mergeAttachments( + existing: ChatAttachmentView[] | undefined, + incoming: ChatAttachmentView[] | undefined, +): ChatAttachmentView[] | undefined { + if (!existing || existing.length === 0) return incoming; + if (!incoming || incoming.length === 0) return existing; + const seen = new Set(existing.map((attachment) => attachment.id)); + return [...existing, ...incoming.filter((attachment) => !seen.has(attachment.id))]; +} + +function normalizeAttachment(value: unknown): ChatAttachmentView | null { + if (typeof value !== "object" || value === null || Array.isArray(value)) return null; + const record = value as Record; + if (typeof record.id !== "string" || record.id.length === 0) return null; + const filename = typeof record.filename === "string" && record.filename.length > 0 ? record.filename : "file"; + const mimeType = + typeof record.mime_type === "string" + ? record.mime_type + : typeof record.mimeType === "string" + ? record.mimeType + : "application/octet-stream"; + const sizeValue = record.size_bytes ?? record.sizeBytes; + const previewValue = record.preview_url ?? record.previewUrl; + return { + id: record.id, + filename, + mimeType, + sizeBytes: typeof sizeValue === "number" ? sizeValue : null, + previewUrl: typeof previewValue === "string" ? 
previewValue : `/chat/attachments/${record.id}/preview`, + }; +} + +function normalizeAttachments(value: unknown): ChatAttachmentView[] { + if (!Array.isArray(value)) return []; + return value.map(normalizeAttachment).filter((attachment): attachment is ChatAttachmentView => attachment !== null); +} + function updateTextBlockInContent(s: ChatState, blockId: string, updater: (text: string) => string): ChatState { const block = s.textBlocks.get(blockId); if (!block) return s; @@ -136,6 +175,7 @@ export function dispatchFrame( id: data.message_id as string, role: "user" as const, content: [{ type: "text", text: data.text as string }], + attachments: normalizeAttachments(data.attachments), createdAt: data.sent_at as string, status: "committed" as const, }), @@ -323,9 +363,12 @@ export function dispatchFrame( .findIndex((message) => message.role === "assistant" && message.status === "streaming"); const normalizedIndex = messageId !== null ? index : index >= 0 ? msgs.length - 1 - index : -1; const target = normalizedIndex >= 0 ? msgs[normalizedIndex] : undefined; - if (target && target.role === "assistant" && target.status === "streaming") { - const newStatus = event === "session.error" ? 
"error" : "committed"; - msgs[normalizedIndex] = { ...target, status: newStatus }; + if (target && target.role === "assistant") { + if (event === "session.error") { + msgs[normalizedIndex] = { ...target, status: "error" }; + } else if (target.status === "streaming") { + msgs[normalizedIndex] = { ...target, status: "committed" }; + } } return { ...s, diff --git a/chat-ui/src/lib/chat-types.ts b/chat-ui/src/lib/chat-types.ts index dd3e0bf5..6e9b7488 100644 --- a/chat-ui/src/lib/chat-types.ts +++ b/chat-ui/src/lib/chat-types.ts @@ -18,10 +18,19 @@ export type ContentBlock = { [key: string]: unknown; }; +export type ChatAttachmentView = { + id: string; + filename: string; + mimeType: string; + sizeBytes: number | null; + previewUrl: string; +}; + export type ChatMessage = { id: string; role: "user" | "assistant"; content: ContentBlock[]; + attachments?: ChatAttachmentView[]; createdAt: string; status: "committed" | "streaming" | "error"; stopReason?: string | null; diff --git a/chat-ui/src/lib/client.ts b/chat-ui/src/lib/client.ts index 8cb4a868..fa019026 100644 --- a/chat-ui/src/lib/client.ts +++ b/chat-ui/src/lib/client.ts @@ -129,16 +129,25 @@ export function abortSession(id: string): Promise { }).then(() => undefined); } -export function sendMessage(sessionId: string, text: string, tabId: string): ReadableStream { +export function sendMessage( + sessionId: string, + text: string, + tabId: string, + attachmentIds?: string[], +): ReadableStream { const controller = new AbortController(); const stream = new ReadableStream({ async start(streamController) { try { + const body: Record = { session_id: sessionId, text, tab_id: tabId }; + if (attachmentIds && attachmentIds.length > 0) { + body.attachment_ids = attachmentIds; + } const res = await fetch("/chat/stream", { method: "POST", credentials: "include", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session_id: sessionId, text, tab_id: tabId }), + body: JSON.stringify(body), signal: 
controller.signal, }); if (!res.ok || !res.body) { diff --git a/chat-ui/src/routes/session-route.tsx b/chat-ui/src/routes/session-route.tsx index 6a7d9642..348594b6 100644 --- a/chat-ui/src/routes/session-route.tsx +++ b/chat-ui/src/routes/session-route.tsx @@ -3,13 +3,14 @@ import { DropOverlay } from "@/components/drop-overlay"; import { IosInstallBanner } from "@/components/ios-install-banner"; import { MessageList } from "@/components/message-list"; import { NotificationBanner } from "@/components/notification-banner"; -import { useAttachments } from "@/hooks/use-attachments"; +import { shouldBlockSendAfterUpload, useAttachments } from "@/hooks/use-attachments"; import { useChat } from "@/hooks/use-chat"; import { useDragDrop } from "@/hooks/use-drag-drop"; import { useFocusHeartbeat } from "@/hooks/use-focus-heartbeat"; import { usePaste } from "@/hooks/use-paste"; import { useCallback, useEffect, useRef, useState } from "react"; import { useLocation, useParams } from "react-router-dom"; +import { toast } from "sonner"; export function SessionRoute() { const { sessionId } = useParams<{ sessionId: string }>(); @@ -56,13 +57,22 @@ export function SessionRoute() { }, [sessionId, location.state, location.pathname, sendMessage]); const handleSend = useCallback( - async (text: string) => { - if (!sessionId) return; - const attachmentIds = await uploadFiles(sessionId); - if (attachmentIds.length > 0) clearFiles(); - sendMessage(text, attachmentIds.length > 0 ? attachmentIds : undefined); + async (text: string): Promise => { + if (!sessionId) return false; + const uploadResult = await uploadFiles(sessionId); + if (shouldBlockSendAfterUpload(uploadResult)) { + const message = + uploadResult.acceptedIds.length === 0 + ? "No files uploaded. Your message was not sent." + : "Some files did not upload. 
Your message was not sent."; + toast.error(message); + return false; + } + if (uploadResult.acceptedIds.length > 0) clearFiles(); + sendMessage(text, uploadResult.acceptedIds.length > 0 ? uploadResult.acceptedIds : undefined); sentCountRef.current++; setHasSentMessage(true); + return true; }, [sessionId, uploadFiles, clearFiles, sendMessage], ); diff --git a/prompts/phase-10h-chat-integrity-builder.md b/prompts/phase-10h-chat-integrity-builder.md new file mode 100644 index 00000000..2fa46354 --- /dev/null +++ b/prompts/phase-10h-chat-integrity-builder.md @@ -0,0 +1,117 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product manager at Anthropic, all three at once. You are implementing the Phase 10H Phantom chat transcript-integrity slice in `/Users/truffle/work/phantom-murph-hardening`. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No "v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Mission + +Make the Phantom chat transcript trustworthy enough for production Murph use before we add more ambitious polish. The UI can be beautiful only if the underlying conversation record is honest, durable, and replayable. + +This slice focuses on correctness, not broad redesign: + +1. Runtime or SDK result errors must not become empty successful assistant rows. +2. User attachment metadata must persist without storing raw base64 payloads in chat message content. +3. Sent user-message attachments must appear live and after reload. +4. Upload failures must not silently send a prompt without the intended files. +5. Assistant messages with multiple text blocks must render all text. +6. Chat input must not double-send while upload is pending. + +## Required Reading + +Read these in order: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +4. 
`/Users/truffle/work/murph/QUALITY-BAR.md` +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +6. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-phantom-chat-review.md` +7. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-pi-thinking-research.md` +8. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-provider-thinking-research.md` + +Then inspect the current source directly. Agent summaries are not evidence. + +## Owned Areas + +You may edit these areas as needed: + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/` +- Focused tests under those same areas. + +Keep changes tightly scoped. Do not redesign the full chat UI in this slice. + +## Product Bar + +The transcript is a trust surface. When the agent runs a long task, creates files, uses tools, or attaches user-provided context, the UI must preserve what happened without pretending failed work succeeded. + +Good behavior: + +- The live view and reload view agree. +- Attachments show as first-class user message context. +- Invalid or failed attachments are visible to the user and block sending until fixed. +- Assistant text is not silently dropped because it arrived as multiple blocks. +- Terminal SDK errors produce visible error state, not blank success. + +Bad behavior: + +- Showing an assistant row as successful when the durable transcript will later remove it. +- Persisting base64 image or file payloads in the chat transcript. +- Sending the user's prompt without files after upload failures. +- Flattening tool or content data with brittle string parsing. +- Adding unrelated UI polish before fixing transcript correctness. + +## Implementation Notes + +Prefer structured helpers over ad hoc parsing. 
If the durable message content already contains structured JSON blocks, parse them centrally and render from that shape. + +Keep SDK runtime message content separate from display-safe transcript content. The model may need attachment payloads during the run; the durable chat transcript should keep metadata and previews, not raw base64. + +Make upload behavior explicit. If a user selected three files and one failed, the send path should know that and avoid silently continuing with two files. + +Terminal result errors need both backend and frontend protection. If a provider produces an error after assistant start, the backend should avoid synthetic success completion frames and the frontend should let `session.error` override any matching assistant row. + +## Acceptance Criteria + +1. Result errors after assistant start do not emit a normal assistant end frame. +2. `session.error` can mark an existing assistant row as error even if a prior event ended it. +3. User attachments are committed to message ownership and replay as attachment chips after reload. +4. Durable content JSON does not contain raw base64 data URLs from attachment payloads. +5. Invalid, wrong-session, or already-sent attachment IDs are rejected with a 400 response. +6. Upload failures block send and preserve the composer content. +7. Multiple assistant text blocks render in order. +8. TypeScript remains strict: no explicit `any`, no `@ts-ignore`, no hidden type escapes. +9. Existing chat streaming, replay, and resume behavior still passes tests. 
+ +## Required Verification + +Run focused tests first, then enough local gates to establish confidence: + +```sh +bun test src/chat/__tests__/writer.test.ts +bun test src/chat/__tests__/http.test.ts src/chat/__tests__/message-builder.test.ts src/chat/__tests__/upload.test.ts +bun test src/chat/__tests__/sdk-to-wire.test.ts +cd chat-ui && bun test src/lib/__tests__/chat-store.test.ts +bun run lint +bun run typecheck +cd chat-ui && bun run typecheck +cd chat-ui && bun run build +git diff --check +``` + +If a command cannot run, document why and leave the work in a clean resume state. + +## Handoff Contract + +When done, report: + +1. Files changed. +2. Behavior shipped. +3. Tests run and exact pass/fail status. +4. Any residual risks. +5. Whether the diff is ready for an independent review agent. + +Do not commit. Do not push. The orchestrator will verify by reading files and running checks. + +ultrathink. The floor is correctness. Polish can only sit on top of a transcript users can trust. diff --git a/prompts/phase-10h-chat-integrity-review.md b/prompts/phase-10h-chat-integrity-review.md new file mode 100644 index 00000000..341fd0bd --- /dev/null +++ b/prompts/phase-10h-chat-integrity-review.md @@ -0,0 +1,57 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product manager at Anthropic, all three at once. You are reviewing the Phase 10H Phantom chat transcript-integrity slice in `/Users/truffle/work/phantom-murph-hardening`. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No "v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Mission + +Review the current uncommitted diff. Do not edit application code. Write your review report to: + +`/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-chat-integrity-review.md` + +Focus on bugs, regressions, missing tests, and product-trust issues. + +## Required Reading + +Read: + +1. 
`/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +4. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-phantom-chat-review.md` +6. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-pi-thinking-research.md` +7. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-provider-thinking-research.md` +8. `/Users/truffle/work/phantom-murph-hardening/prompts/phase-10h-chat-integrity-builder.md` + +Then inspect the actual diff with `git diff`, not summaries. + +## Review Scope + +Pay special attention to: + +- Runtime or SDK result errors must not become empty successful assistant rows. +- User attachment metadata must be persisted without base64 payloads. +- Sent user-message attachments must appear live and after reload. +- Upload failures must not silently send prompts without the intended files. +- Assistant messages with multiple text blocks must render all text. +- Chat input should not double-send while upload is pending. +- Attachment preview URLs must stay authenticated and safe. +- Existing chat streaming, replay, and resume semantics must not regress. +- No explicit `any`, `@ts-ignore`, hidden type escapes, or broad unrelated refactors. +- No generated build artifact should be included unless there is a clear reason. + +## Output Format + +Findings first, ordered by severity. Use P0/P1/P2/P3. Include exact file paths and line references. If there are no P0/P1/P2 findings, say that explicitly. + +Then include: + +1. Tests you ran or inspected. +2. Residual risks. +3. Whether this diff is safe to proceed to live browser verification. + +Do not make code changes. Write only the report file. + +ultrathink. Treat the transcript as a trust surface. 
The review should be skeptical, specific, and grounded in actual files. diff --git a/prompts/phase-10h-phantom-chat-review.md b/prompts/phase-10h-phantom-chat-review.md new file mode 100644 index 00000000..24201a42 --- /dev/null +++ b/prompts/phase-10h-phantom-chat-review.md @@ -0,0 +1,59 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. You are reviewing Phantom's current +chat UI and chat event pipeline for best-in-class agent experience. + +Mission: + +Audit Phantom's current chat experience after PR 109. Identify the highest +leverage fixes for durable run timeline, richer progress, thinking display, +tool-card behavior, loading/idle states, visual polish, and post-compaction +continuation. This is a review/research pass, not a builder pass. + +Required reading, in order: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/work/murph/QUALITY-BAR.md` +3. `/Users/truffle/work/murph/PROGRESS.md` +4. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10a-synthesis.md` +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10d-chat-ui-polish-research.md` +6. Current Phantom files under: + - `src/chat/` + - `src/agent/` + - `chat-ui/src/components/` + - `chat-ui/src/lib/` + - `chat-ui/src/hooks/` + - `chat-ui/src/routes/` + +You are not alone in the codebase. Do not overwrite or revert other work. + +Deliverable: + +Write `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-phantom-chat-review.md`. + +The report must include: + +1. Findings first, ordered by severity and user impact. Include file and line + references for every finding. +2. A concrete "next builder slice" that can be implemented and verified in one + PR without ballooning scope. +3. A second slice backlog for larger work that should not block the next PR. +4. 
UI polish recommendations grounded in the existing design system and the + screenshots Cheema shared: tool cards collapsed by default, useful expanded + content, run activity while tools execute, top/border/input polish, no dead + feeling during long work. +5. Testing plan: focused unit tests, chat-ui tests if present, production + build, and live browser verification flows against Murph. + +Non-goals: + +- Do not edit application code. +- Do not commit, push, or open a PR. +- Do not make generic design advice. Tie every recommendation to a real + Phantom file or observed behavior. + +Self-review: + +Before finishing, verify each finding by reading the actual source. Do not rely +on prior summaries. diff --git a/prompts/phase-10h-pi-thinking-research.md b/prompts/phase-10h-pi-thinking-research.md new file mode 100644 index 00000000..95da7013 --- /dev/null +++ b/prompts/phase-10h-pi-thinking-research.md @@ -0,0 +1,68 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. You are researching how Phantom should +render agent thinking, tool progress, and long-running activity now that +Phantom runs on Murph, which runs on pi-mono. + +Mission: + +Find what Pi and Pi-adjacent code already provide for thinking display, +progress display, activity rows, run timelines, and CLI or web rendering. Do +not let Phantom reinvent primitives Pi already gives us. If Pi does not provide +the UI primitive, identify the cleanest Phantom-owned layer. + +Required reading, in order: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/work/murph/QUALITY-BAR.md` +3. `/Users/truffle/work/murph/PROGRESS.md` +4. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10g-pi-continuity.md` +5. 
Local pi-mono sources under `/Users/truffle/work/pi-mono`, especially: + - `packages/agent/src/` + - `packages/ai/src/types.ts` + - `packages/ai/src/providers/` + - `packages/web-ui/README.md` + - `packages/web-ui/src/` + - `packages/mom/src/` + +External source rules: + +- Use primary sources only. GitHub repositories, official docs, package + READMEs, and source code are acceptable. +- If you need to clone another Pi/Pi Code related repository from pi.dev or + GitHub, clone it under `/Users/truffle/work/research-clones/`. +- Do not use SEO blogs, scraped docs, or tutorials as evidence. +- Do not copy implementation from any repo into Phantom. This is research and + architecture guidance only. + +Deliverable: + +Write `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-pi-thinking-research.md`. + +The report must include: + +1. Source inventory: exact repos/files read and why they matter. +2. How Pi represents thinking, streaming text, tool calls, tool progress, + usage, and completion. +3. Whether Pi has a CLI or web rendering pattern we should reuse directly, + adapt conceptually, or ignore. +4. What Phantom should own versus what Murph/Pi should own. +5. Specific recommendations for Phantom's chat UI, ordered by impact. +6. Risks and anti-patterns, especially places where a custom UI could break + provider protocol or leak unsafe content. +7. Concrete acceptance criteria for the next builder slice. + +Non-goals: + +- Do not edit Phantom application code. +- Do not change Murph code. +- Do not commit, push, or open a PR. +- Do not create a broad open-source polish plan. Stay focused on chat + experience and thinking/progress rendering. + +Self-review: + +Before finishing, re-read your report and verify every factual claim has a +local file path or primary-source URL. If a claim is an inference, label it as +an inference. 
diff --git a/prompts/phase-10h-provider-thinking-research.md b/prompts/phase-10h-provider-thinking-research.md new file mode 100644 index 00000000..788fc66b --- /dev/null +++ b/prompts/phase-10h-provider-thinking-research.md @@ -0,0 +1,72 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. You are researching provider thinking +and reasoning event support across Murph, Pi, Phantom, OpenAI, Anthropic, and +ZAI. + +Mission: + +Determine what Phantom can truthfully show as "thinking" or "reasoning" today +without lying to the user or exposing unsafe private chain-of-thought. We need +best-in-class progress, but it must respect provider semantics. + +Required reading, in order: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/work/murph/QUALITY-BAR.md` +3. `/Users/truffle/work/murph/PROGRESS.md` +4. Murph source: + - `/Users/truffle/work/murph/packages/core/src/events/` + - `/Users/truffle/work/murph/packages/core/src/substrate/` + - `/Users/truffle/work/murph/packages/core/src/providers/` + - `/Users/truffle/work/murph/packages/anthropic-sdk-shim/src/` +5. Phantom source: + - `src/chat/sdk-to-wire.ts` + - `src/chat/sdk-to-wire-handlers.ts` + - `src/chat/types.ts` + - `src/chat/run-timeline.ts` + - `chat-ui/src/components/thinking-block.tsx` + - `chat-ui/src/components/message.tsx` + - `chat-ui/src/lib/chat-types.ts` +6. Pi source: + - `/Users/truffle/work/pi-mono/packages/ai/src/types.ts` + - `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses.ts` + - `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts` + - `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-completions.ts` + - `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts` + +External docs: + +- Use official provider docs only if needed. Prefer OpenAI and Anthropic + official docs, ZAI official docs or source-backed docs. 
+- Do not use SEO sites or tutorials. + +Deliverable: + +Write `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-provider-thinking-research.md`. + +The report must include: + +1. Event taxonomy: what events we currently have for thinking, redacted + thinking, progress summaries, token usage, tool calls, and rate limits. +2. Provider capability matrix for OpenAI, Anthropic, and ZAI as routed through + Pi/Murph. +3. What can be shown verbatim, what should be summarized, and what must be + hidden or labeled as private/redacted. +4. Whether Phantom can show thinking tokens/counts today, and if so where the + data comes from. If not, name the missing Murph/Pi event or field. +5. Recommended UI model for "thinking" that is honest across providers. +6. Tests needed before shipping thinking/progress UI changes. + +Non-goals: + +- Do not edit code. +- Do not print or inspect API key values. +- Do not run costly live model loops unless absolutely needed. This is a source + and event-contract research pass. + +Self-review: + +Before finishing, check that each recommendation maps to an existing event or a +clearly named event-contract gap. diff --git a/research/chat-experience/phase-10h-chat-integrity-review.md b/research/chat-experience/phase-10h-chat-integrity-review.md new file mode 100644 index 00000000..32e3886c --- /dev/null +++ b/research/chat-experience/phase-10h-chat-integrity-review.md @@ -0,0 +1,50 @@ +# Phase 10H Chat Integrity Review + +## Findings + +### P1: Live result errors can still leave a committed assistant row + +`/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire.ts:214` emits `message.assistant_end` for any SDK `result` when an assistant has started, before the subtype is checked at `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire.ts:228`. 
The live reducer then treats that `message.assistant_end` as success by changing the streaming assistant row to `committed` at `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts:332`. When the following `session.error` arrives, `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts:351` only changes the row to `error` if it is still `streaming`, so the already committed row stays visually successful. + +Impact: an SDK run that emits `assistant_start` or partial assistant text and then ends with a non-success `result` can still show a successful assistant row in the live transcript. The backend skip at `/Users/truffle/work/phantom-murph-hardening/src/chat/writer.ts:148` prevents durable assistant commit, so reload later removes that row. That creates a live versus reload trust mismatch and still violates the requirement that runtime or SDK result errors must not become empty successful assistant rows. + +The current coverage misses this shape. `/Users/truffle/work/phantom-murph-hardening/src/chat/__tests__/writer.test.ts:231` covers a non-success result with no prior assistant event, and `/Users/truffle/work/phantom-murph-hardening/src/chat/__tests__/sdk-to-wire.test.ts:402` only asserts that a result error creates `session.error`. + +Fix direction: for non-success SDK results, either do not emit a normal `message.assistant_end` before `session.error`, or make `session.error` override the assistant row to `error` even after `assistant_end` marked it committed. Add reducer and writer coverage for `assistant_start -> result error` and `assistant_start -> text -> result error`. + +No P0 or P2 findings found. + +## Tests Ran Or Inspected + +- `bun test src/chat/__tests__/writer.test.ts src/chat/__tests__/message-builder.test.ts src/chat/__tests__/http.test.ts`, 40 pass. +- `bun test src/chat/__tests__/sdk-to-wire.test.ts src/chat/__tests__/run-timeline.test.ts src/chat/__tests__/http-resume.test.ts`, 72 pass. 
+- `cd chat-ui && bun test src/lib/__tests__/chat-store.test.ts`, 26 pass. +- `bun run typecheck`, pass. +- `cd chat-ui && bun run typecheck`, pass. +- `bun run lint`, pass for backend `src/`. +- `cd chat-ui && bun run build`, pass. Vite reported unresolved `/chat/fonts/...` runtime font references and the existing chunk-size warning. +- `git diff --check`, pass. +- `cd chat-ui && bun run lint` was attempted, but `chat-ui/package.json` has no `lint` script. + +## Residual Risks + +- The required reading file `/Users/truffle/work/phantom-murph-hardening/prompts/phase-10h-chat-integrity-builder.md` is not present on disk. I searched the prompts and chat-experience research directories and continued from the actual diff plus the available Phase 10H materials. +- I did not perform live browser verification. +- The main remaining untested path is the live reducer sequence where an assistant row starts before a terminal SDK result error. + +## Live Browser Verification + +Not safe to proceed as the candidate diff yet. Fix the P1 live transcript error-row issue first, then run live browser verification against the repaired slice. + +## Re-Review + +P1 resolved. + +Evidence: `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire.ts:214` now gates the synthetic `message.assistant_end` on `subtype === "success"`, so a non-success SDK `result` after assistant start emits `session.error` without first producing a normal assistant completion frame. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts:366` now lets `session.error` mark the matching assistant row `error` regardless of whether that row was already `streaming` or `committed`. + +Targeted coverage is present. `/Users/truffle/work/phantom-murph-hardening/src/chat/__tests__/sdk-to-wire.test.ts:418` asserts that a result error after assistant start does not emit `message.assistant_end`. 
`/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/__tests__/chat-store.test.ts:164` asserts that `session.error` changes a previously ended committed assistant row to `error`. + +Re-review tests: + +- `bun test src/chat/__tests__/sdk-to-wire.test.ts`, 58 pass. +- `cd chat-ui && bun test src/lib/__tests__/chat-store.test.ts`, 27 pass. diff --git a/research/chat-experience/phase-10h-live-verification.md b/research/chat-experience/phase-10h-live-verification.md new file mode 100644 index 00000000..02bf40a3 --- /dev/null +++ b/research/chat-experience/phase-10h-live-verification.md @@ -0,0 +1,66 @@ +# Phase 10H Live Verification + +Date: 2026-05-01 + +## Scope + +Verified the Phase 10H transcript-integrity slice against a real local Phantom server running on Murph with OpenAI. + +Server shape: + +- Phantom repo: `/Users/truffle/work/phantom-murph-hardening` +- Branch: `codex/chat-experience-polish-10h` +- Local URL: `http://127.0.0.1:3133` +- Runtime: Murph via `/Users/truffle/work/murph/packages/anthropic-sdk-shim/dist/index.js` +- Provider: OpenAI +- Model configured in Phantom: `gpt-5.5` + +## Test + +1. Started Phantom locally with Murph and OpenAI. +2. Built the chat UI and copied the generated SPA into `public/chat` as a temporary local artifact. +3. Authenticated through the UI login flow using a short-lived bootstrap token. +4. Created a new chat session. +5. Attached `/tmp/phantom-phase10h-attachment-smoke.txt`. +6. Sent: "Use the attached text file. Answer in exactly one sentence and include the marker word from the file." +7. Waited for the real model response. +8. Reloaded the page and verified durable replay. +9. Inspected SQLite rows for the session. + +Session id: + +`2c4b3f1c-5857-4105-a85c-7cf2a04444e4` + +Screenshots: + +- `/tmp/phantom-phase10h-live-after-send.png` +- `/tmp/phantom-phase10h-live-after-reload.png` + +## Results + +Pass: + +- User attachment chip appeared before send. 
+- The model used the attached file and answered with `marker-lima-742`. +- The attachment chip survived reload. +- The assistant answer survived reload. +- Tool cards rendered in the run timeline and stayed collapsed by default after reload. +- Session detail contained two durable messages, one user and one assistant. +- The user message `content_json` contained the attachment metadata and prompt text. +- The user message `content_json` did not contain raw base64. +- The user message `content_json` did not contain the file-only marker text. +- The assistant answer was stored in `content_json` as `"The marker word is marker-lima-742."`. + +Observed but not blocking: + +- Browser console showed two `503` responses from push-notification endpoints. This matches local push being unconfigured and did not affect chat send, stream, reload, or transcript integrity. +- The Playwright API request context did not reflect the UI auth state, so the live test used browser-side `fetch` after login. The UI was authenticated and the session flow worked. + +## Remaining Work + +This verifies the first production-trust slice. The next chat-experience slice should focus on: + +1. Durable run timeline polish: keep tool cards collapsed by default, preserve expansion state, and make replayed timelines feel identical to live timelines. +2. Richer progress: surface a useful active-state row during long tool-heavy tasks instead of making the UI feel idle. +3. Artifact and file affordances: created pages, generated files, previews, copy/open actions, and markdown outputs should become first-class chat objects. +4. Provider matrix: repeat the small live chat smoke for Anthropic and GLM/Z.AI after the OpenAI path is committed cleanly. 
diff --git a/research/chat-experience/phase-10h-phantom-chat-review.md b/research/chat-experience/phase-10h-phantom-chat-review.md new file mode 100644 index 00000000..4168e0ab --- /dev/null +++ b/research/chat-experience/phase-10h-phantom-chat-review.md @@ -0,0 +1,150 @@ +# Phase 10H Phantom Chat Review + +Date: 2026-05-01 + +Scope: review only. No application code changes. + +## Findings + +### P1: SDK result errors can still become empty successful assistant rows + +`src/chat/sdk-to-wire.ts:240-250` translates a non-success SDK `result` into a `session.error` frame, but `src/agent/chat-query.ts:154-158` only records cost and keeps going unless the result is the special "No conversation found" shape. `src/chat/writer.ts:121-133` then unconditionally commits an assistant message with `response.text` and `stopReason: "end_turn"`. + +Impact: provider errors, including post-compaction overflow failures, can leave a normal empty assistant row in durable chat history. That is exactly the wrong trust signal: the run errored, but reload can show a successful blank assistant turn. + +Fix direction: make `executeChatQuery` return or throw a typed terminal error for non-success result subtypes. `ChatSessionWriter` should not commit a successful assistant message when the terminal frame is `session.error`; it should persist only the errored run timeline, or an explicit error assistant row if the product wants one. + +### P1: Assistant rendering drops all text after the first text block + +`chat-ui/src/lib/chat-store.ts:181-200` supports multiple text blocks on one assistant message. That is required for tool loops because the runtime can stream separate content blocks across a single user turn. `chat-ui/src/components/assistant-message.tsx:15-31` then uses `find((b) => b.type === "text")` and renders only the first block. + +Impact: if the assistant emits a pre-tool note and later emits the real answer in another text block, the UI can hide the actual answer. 
This is a direct correctness issue, not just polish. + +Fix direction: render all text blocks in order, at minimum by mapping every `message.content` text block to `Markdown`. A later richer version can preserve exact interleaving between thinking, tools, and text, but the next PR should make no text block invisible. + +### P1: Uploaded files are sent to the runtime but disappear from the chat record + +`src/chat/http-handlers.ts:102-108` builds the SDK message from `attachment_ids`, so the model can receive files. The writer then commits the user message from the SDK content at `src/chat/writer.ts:67-72`, emits `attachments: []` at `src/chat/writer.ts:76-83`, and never calls `ChatAttachmentStore.commitToMessage` from `src/chat/attachment-store.ts:54-60`. The client reducer also discards any user-frame attachments and stores only text at `chat-ui/src/lib/chat-store.ts:130-146`. + +There is a second footgun in `chat-ui/src/routes/session-route.tsx:58-64`: if upload fails and returns no IDs, the message still sends without the intended files. + +Impact: the user can attach a file, Phantom can use it, then the transcript gives no durable evidence of what was attached. On upload failure, the user may accidentally send a file-dependent prompt without the file. + +Fix direction: pass accepted attachment metadata into `ChatSessionWriter`, call `commitToMessage`, emit populated `user.message.attachments`, add attachment content to `ChatMessage`, and render user-message file chips with preview links. If all intended uploads fail, keep the composer text and block send with a clear error. + +### P2: Generated pages and files are not first-class artifacts in the UI + +The Phantom page tools already return artifact metadata. `src/ui/tools.ts:74-81` returns `path`, `url`, and `size` for `phantom_create_page`, and `src/ui/preview.ts:227-238` returns a screenshot plus JSON metadata for `phantom_preview_page`. 
The backend can parse this for future model context in `src/chat/continuity-context.ts:125-148`. + +The UI path loses that structure. `src/chat/run-timeline.ts:297-307` records only a safe output summary, and `summarizeToolOutput` collapses non-empty output to `"Tool produced output."` at `src/chat/run-timeline.ts:629-633`. `chat-ui/src/lib/chat-activity.ts:115-128` maps only generic tool fields, and `chat-ui/src/components/tool-call-card.tsx:229-245` renders `full_ref` and output as plain text, not as artifact actions or previews. + +Impact: a created page, preview screenshot, or generated file is treated like a raw tool log. The user has to read JSON or markdown links instead of seeing a durable artifact chip with title, URL/path, size, preview status, and open/copy actions. + +Fix direction: add a small artifact summary contract, starting with Phantom-native tools, that carries page title, page URL, path, size, preview status, console issue count, failed request count, and optional screenshot reference. This belongs in Phantom's built-in tool summaries plus UI affordances, not in a new MCP call. + +### P2: Tool identity conflates built-in tools, Phantom-native MCP tools, external MCP tools, and UI actions + +Phantom registers core app capabilities as in-process MCP servers in `src/index.ts:249-256`, including dynamic tools, scheduler, reflective memory, web UI, secrets, preview, and browser. The wire translator only infers MCP via string heuristics at `src/chat/sdk-to-wire-handlers.ts:141-150` and `src/chat/sdk-to-wire-handlers.ts:223-232`. For canonical `mcp__server__tool` names, `toolName.split(":")[0]` does not extract the server. The card subtitle logic handles only Claude-style built-ins like `Read`, `Write`, `Bash`, and `WebFetch` at `chat-ui/src/components/tool-call-card.tsx:28-48`. + +Impact: a Phantom page creation tool, an external MCP integration, and a generic unknown tool can all render with weak labels.
Collapsed cards are less useful than they should be, and the UI cannot choose the right affordance, such as opening a page, previewing a file, showing scheduler metadata, or just labeling an external MCP call. + +Fix direction: normalize tool identity into `{ origin, serverName, rawName, displayName, capabilityKind }`. Use Phantom-native mappings for `phantom_create_page`, `phantom_preview_page`, `phantom_generate_login`, scheduler, secrets, memory, and browser. Reserve MCP labeling for external server boundaries. Use UI controls for already-known artifacts instead of forcing the agent to call another tool. + +### P2: The live run activity helps, but long tasks can still feel like detached status chrome + +`chat-ui/src/components/message-list.tsx:75-94` renders `runActivity` after all messages, separate from the current user request and the assistant answer. `chat-ui/src/components/run-activity-row.tsx:173-249` does show label, elapsed time, facts, subagents, and tool rows, which is the right foundation. The remaining issue is hierarchy: it reads as another transcript row, not the header of the current run. + +Impact: long-running tasks no longer go completely silent, but the user still has to infer which request is active and why the agent is quiet. This matters most when a tool has started but there is no assistant prose yet, during compaction, reconnect, browser preview, or long Bash work with little output. + +Fix direction: attach the active run strip to the current user message until assistant content starts, then make it the header of the assistant run. Keep elapsed time live. Prefer labels like `Running Bash`, `Previewing page`, `Compacting context`, `Waiting for permission`, and `Reconnected` over generic `Working...` once the frame data supports it. + +### P2: Markdown and code rendering are functional, not yet product-grade + +`chat-ui/src/components/markdown.tsx:6-49` only customizes code and links. 
It relies on `prose prose-sm`, but `chat-ui/package.json:13-29` does not include `@tailwindcss/typography`, so those typography classes may be inert depending on the Tailwind v4 setup. `chat-ui/src/components/code-block.tsx:21-45` renders a bordered `pre` with no syntax highlighting, no wrap toggle, and no table handling, and the copy control is hidden behind hover at `chat-ui/src/components/code-block.tsx:28-32`. + +Impact: tables, long links, generated URLs, code, and citations will work at a basic level, but they do not feel like a polished developer-facing agent surface. Touch users may not discover copy controls. + +Fix direction: add explicit markdown components for tables, lists, blockquotes, links, generated `/ui/...` URLs, and code blocks. Make copy visible on touch/focus, add a wrap toggle, and keep tables horizontally scrollable with tabular numerics. + +### P3: Thinking display is safe, but under-informative and not tied to provider capability + +The current UI avoids raw chain-of-thought: `chat-ui/src/components/thinking-block.tsx:4-15` shows `Thinking...`, `Thought`, or `Reasoning hidden`. That is safer than exposing private reasoning. However, `src/chat/sdk-to-wire-handlers.ts:269-276` emits `message.thinking_end` without duration, and Murph maps thinking start/delta/end at `/Users/truffle/work/murph/packages/core/src/query/query.ts:317-330` without any UI-facing provider capability summary. + +Murph model metadata marks thinking support for OpenAI and ZAI models at `/Users/truffle/work/murph/packages/core/src/providers/models.ts:96-115` and `/Users/truffle/work/murph/packages/core/src/providers/models.ts:162-190`, but Phantom's UI does not know whether the current provider supports visible reasoning, redacted reasoning, summaries, or only effort levels. + +Impact: the UI is honest, but it cannot yet tell the user "reasoning is hidden", "reasoned for 8s", or "this provider does not expose reasoning" in a consistent way.
+ +Fix direction: keep raw thinking hidden by default. Add timing and provider/model capability metadata to the safe activity layer. Only show reasoning summaries when Murph has an explicit safe summary contract, not from private thinking deltas. + +## Next Builder Slice + +Recommended one-PR scope: **chat transcript integrity plus first-class user files**. + +Implement: + +1. Fix non-success SDK result handling so `session.error` cannot also create an empty successful assistant row. +2. Render every assistant text block, not only the first. +3. Persist, stream, load, and render user attachments as file chips with preview/open metadata. +4. Block send when intended attachments fail to upload. +5. Add small visual polish to the active run strip so it stays visibly attached to the current turn, without building the full artifact drawer yet. + +Acceptance criteria: + +- No empty assistant commit after SDK `result` subtype errors. +- Multi-block assistant messages show all text blocks. +- Uploaded images, PDFs, and text files appear on the sent user message live and after reload. +- Failed uploads do not silently send the prompt without files. +- A long `sleep` or page-preview run shows visible activity within one second and keeps a live elapsed timer. + +## Second Slice Backlog + +1. Artifact summaries and previews for `phantom_create_page` and `phantom_preview_page`: URL chip, open, copy, page title, size, preview screenshot, console and network issue counts. +2. Tool identity normalization: distinguish Murph/Pi built-ins, Claude-style built-ins, Phantom-native in-process tools, external MCP tools, and UI-only affordances. +3. Rich tool details: structured parameters, stdout/stderr sections for Bash, file diff summaries for edits, browser/page preview metadata, redaction notices, and safe full-output references. +4. Markdown polish: tables, code highlighting, wrap toggle, always reachable copy, link cards for generated URLs, and mobile-safe overflow. +5. 
Thinking contract: provider capability labels, duration, hidden reasoning copy, and future safe reasoning summaries when Murph exposes them. +6. Durable artifact gallery: a lightweight session artifact rail fed by existing tool output and file metadata, not by extra agent work. +7. Composer polish: environment/status line, upload progress, attachment error recovery, command affordance, and clearer stop state. + +## UI Polish Notes + +- Tool cards should stay collapsed by default for completed tools, and auto-open only for error and blocked states. The current behavior at `chat-ui/src/components/tool-call-card.tsx:164-171` is correct. +- Collapsed cards need better content: `phantom_create_page` should show page title/path, `phantom_preview_page` should show preview status and issue counts, Bash should show the command verb and elapsed time, and external MCP should show server plus tool. +- Expanded cards should avoid one raw wall of text. Structure details into Parameters, Output, Artifacts, Errors, Redactions, and Full output. +- The top/header/sidebar/composer borders are still heavy as a stack. `chat-ui/src/components/app-shell.tsx:149-164` and `chat-ui/src/components/chat-input.tsx:97-100` use the same border tone for structural edges and work objects. Keep object borders for tools/code/cards, soften shell boundaries. +- User messages should feel like compact request objects, not chat bubbles. `chat-ui/src/components/user-message.tsx:8-10` is already calmer than saturated purple, but attachments and metadata need to live with the request. +- Avoid faking thinking. The honest states are: reasoning active, reasoning hidden, provider did not expose reasoning, and reasoned for duration if measured. + +## Testing Plan + +Focused backend: + +- `bun test src/chat/__tests__/sdk-to-wire.test.ts` +- Add a non-success `result` test proving `session.error` does not lead to a committed assistant success row. 
+- `bun test src/chat/__tests__/writer.test.ts` +- Add attachment commit/load tests and upload-failure behavior tests. +- `bun test src/chat/__tests__/run-timeline.test.ts` +- Add page artifact summary tests once the artifact slice starts. + +Focused chat UI: + +- `cd chat-ui && bun test src/lib/__tests__/chat-store.test.ts` +- Add tests for multi-text-block rendering state, attachment frames, replayed attachment frames, and failed upload no-send behavior. +- Add component tests if the project adds a renderer harness; otherwise verify through production build plus browser. + +Build and static gates: + +- `bun run typecheck` +- `cd chat-ui && bun run typecheck` +- `cd chat-ui && bun run build` +- `bunx biome check` on touched Phantom files +- `git diff --check` + +Live browser verification against Murph: + +1. Start Phantom locally with `PHANTOM_AGENT_RUNTIME=murph` and OpenAI. +2. Send a long-running prompt such as `sleep 10 && pwd`; verify activity appears within one second, elapsed time advances, stop works, and the final answer renders. +3. Send a prompt that forces a tool loop with text before and after the tool; verify both text blocks render. +4. Upload an image, a PDF, and a text file; verify chips render live, survive reload, and preview/open actions work. +5. Create and preview a `/ui/...` page; verify tool cards are collapsed but useful, expanded details show structured parameters/output, and follow-up context still knows the page URL after reload. +6. Trigger or simulate a provider result error; verify no empty assistant success row appears and the run timeline shows the error state. 
diff --git a/research/chat-experience/phase-10h-pi-thinking-research.md b/research/chat-experience/phase-10h-pi-thinking-research.md new file mode 100644 index 00000000..c401d474 --- /dev/null +++ b/research/chat-experience/phase-10h-pi-thinking-research.md @@ -0,0 +1,590 @@ +# Phase 10H Pi Thinking Research + +## Scope And Constraints + +This report executes the saved prompt at `/Users/truffle/work/phantom-murph-hardening/prompts/phase-10h-pi-thinking-research.md` and incorporates the additional operator direction in `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md`. + +Constraints honored: + +- No application code was edited. +- No files were reverted. +- The only write target used for this task is `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-pi-thinking-research.md`. +- This report treats local source files as evidence and labels design recommendations as inference when they go beyond direct source facts. + +## Executive Summary + +Pi has strong primitives for honest thinking, text streaming, tool calls, tool execution progress, usage, completion, and provider continuity. The strongest reusable layer is not the Pi web UI itself, but the separation it demonstrates: provider adapters preserve protocol details, the agent loop emits a stable event grammar, and UI components render already-normalized facts without mutating the model transcript. + +Phantom should adapt Pi concepts, not import Pi UI wholesale. Pi web UI is built around `mini-lit`, its own `Agent` usage model, and an in-memory artifact tool. Phantom already has a React chat UI, Murph-normalized events, durable sessions, page and file concepts, and product-specific artifacts. The next high-impact slice should make files and artifacts first-class chat surfaces, improve markdown and inspection, and keep thinking honest by rendering only provider-backed reasoning events or explicitly labeled redacted/hidden states. 
+ +The product direction file is aligned with the Pi evidence: built-in Phantom tools should own native pages, artifacts, files, auth-sensitive previews, and session-specific operations; MCP tools should remain external and reusable integrations; UI affordances should inspect, open, copy, filter, expand, retry, and preview state that already exists. UI affordances must not invent tool execution or fabricate provider thinking. + +## Source Inventory + +### Doctrine And Project Contract + +- `/Users/truffle/.claude/AGENTS.md`: Root orchestration doctrine and operating expectations. +- `/Users/truffle/.claude/CLAUDE.md`: Canonical doctrine, communication constraints, verification expectations, and strict professional output rules. +- `/Users/truffle/work/murph/AGENTS.md`: Murph-specific clean-room contract, required reading list, strict TypeScript rule, v1 library-only scope, and safety constraints. +- `/Users/truffle/work/murph/VISION.md`: Murph product and architecture intent. +- `/Users/truffle/work/murph/PROGRESS.md`: Current phase status and already-completed work. +- `/Users/truffle/work/murph/QUALITY-BAR.md`: Verification and completion bar. +- `/Users/truffle/work/murph/ARCHITECTURE.md`: Runtime boundaries and normalized event architecture. +- `/Users/truffle/work/murph/IMPLEMENTATION-PLAN.md`: Planned Murph phase sequencing and scope boundaries. + +Why this matters: the report must keep Murph clean-room, keep Phantom-specific UI outside Murph, and avoid application edits during research. + +### Prompt And Product Direction + +- `/Users/truffle/work/phantom-murph-hardening/prompts/phase-10h-pi-thinking-research.md`: Required deliverables for this report. +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md`: Additional operator direction on files and artifacts, tool ownership, markdown quality, interactive inspection, and honest provider thinking. 
+- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10g-pi-continuity.md`: Prior Pi continuity research and constraints around provider thinking preservation. +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10a-synthesis.md`: Prior synthesis for chat experience direction. +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10c-murph-progress-research.md`: Prior Murph progress research. +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10d-chat-ui-polish-research.md`: Prior Phantom chat polish research. +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10e-progress-ui-implementation-research.md`: Prior progress UI implementation research. + +Why this matters: Phase 10H should extend the existing chat-experience research thread, not restart it or recommend conflicting UI semantics. + +### Pi Agent Runtime + +- `/Users/truffle/work/pi-mono/packages/agent/src/types.ts`: Agent state, messages, tool result shape, tool update callback, and agent events. +- `/Users/truffle/work/pi-mono/packages/agent/src/agent-loop.ts`: Main loop, streaming response handling, tool execution events, transformContext integration, and completion handling. +- `/Users/truffle/work/pi-mono/packages/agent/src/agent.ts`: Agent wrapper, options, thinking-level mapping, event dispatch, state updates, and default LLM conversion. +- `/Users/truffle/work/pi-mono/packages/agent/src/index.ts`: Public exports for the Pi agent package. +- `/Users/truffle/work/pi-mono/packages/agent/src/proxy.ts`: Agent transport/proxy surface. + +Why this matters: Murph already uses Pi as a substrate, so Phantom should receive provider facts through Murph normalization rather than coupling directly to Pi UI internals. 
+ +### Pi AI Types And Providers + +- `/Users/truffle/work/pi-mono/packages/ai/src/types.ts`: `ThinkingLevel`, `ThinkingContent`, `ToolCall`, `Usage`, `AssistantMessageEvent`, and model capability types. +- `/Users/truffle/work/pi-mono/packages/ai/src/stream.ts`: Public simple stream and completion entry points. +- `/Users/truffle/work/pi-mono/packages/ai/src/utils/event-stream.ts`: `AssistantMessageEventStream` lifecycle and `.result()` behavior. +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts`: Cross-model thinking handling, redacted thinking preservation, thought signature dropping, synthetic tool result insertion, and partial assistant skip logic. +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts`: Anthropic thinking display, redacted thinking mapping, signature streaming, tool-use streaming, and usage handling. +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses.ts`: OpenAI Responses reasoning configuration and encrypted reasoning inclusion. +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts`: OpenAI reasoning summary conversion, encrypted reasoning replay, tool-call conversion, and usage mapping. +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-completions.ts`: Completions-compatible reasoning, reasoning details, tool-call streaming, and provider-specific thinking compatibility. +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts`: Google thought signature handling and the distinction between `thought: true` and encrypted signatures. +- `/Users/truffle/work/pi-mono/packages/ai/src/models.ts`: Local model capability metadata including reasoning support and xhigh support. + +Why this matters: these files define what is protocol-backed thinking, what is continuity metadata, and what is safe to surface in UI. 
+ +### Pi Web UI And Artifacts + +- `/Users/truffle/work/pi-mono/packages/web-ui/README.md`: Pi web UI purpose, public components, chat panel usage, attachments, artifacts, storage, and event list. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/ChatPanel.ts`: High-level chat panel and artifact panel integration. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/AgentInterface.ts`: Event subscription, streaming message container hookup, usage stats, attachments, model and thinking selectors. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/MessageList.ts`: Message grouping, tool result pairing, and inline assistant rendering. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/Messages.ts`: Assistant message rendering for text, thinking, tool calls, tool results, errors, and usage. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/StreamingMessageContainer.ts`: Request-animation-frame batching and streaming assistant rendering. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/ThinkingBlock.ts`: Collapsible thinking display with streaming state. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderer-registry.ts`: Tool renderer registry, status headers, disclosure, and default renderer selection. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderers/DefaultRenderer.ts`: Generic tool-call card with params and output formatting. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts.ts`: In-memory artifact tool, preview panel, tabs, and supported artifact types. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts-tool-renderer.ts`: Artifact tool-call renderer with file pills, diffs, code blocks, console logs, and details. +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/ArtifactElement.ts`: Artifact preview base class and header-button abstraction. 
+- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/ArtifactPill.ts`: Clickable artifact pill surface. + +Why this matters: Pi web UI contains useful rendering patterns, but Phantom should adapt them into its React, session, and artifact model. + +### Pi Mom CLI-Style Surfaces + +- `/Users/truffle/work/pi-mono/packages/mom/src/log.ts`: CLI logging for user input, tool start/success/error, response start, thinking, response, downloads, stop, warnings, errors, and usage summary. +- `/Users/truffle/work/pi-mono/packages/mom/src/agent.ts`: Mom agent wiring around Pi coding agent and tool events. +- `/Users/truffle/work/pi-mono/packages/mom/src/store.ts`: JSONL Slack log persistence and downloaded attachment storage. +- `/Users/truffle/work/pi-mono/packages/mom/src/events.ts`: File-driven event scheduling. + +Why this matters: Mom is not a Phantom chat UI, but it demonstrates concise progress logging, output truncation, usage summaries, and file download persistence. + +### Murph Runtime And Normalization + +- `/Users/truffle/work/murph/packages/core/src/types/message.ts`: Murph content blocks, tool progress message shape, compact state, status messages, API retry messages, and file persisted messages. +- `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts`: Normalized event grammar for text, thinking, redacted thinking, tool calls, tool execution, tool progress, session state, compaction, rate limits, retries, prompts, subagents, hooks, permissions, notifications, and errors. +- `/Users/truffle/work/murph/packages/core/src/events/translator-pi.ts`: Pi-to-Murph translation for thinking, redacted thinking, tool calls, images, tool results, usage, and tool execution events. +- `/Users/truffle/work/murph/packages/core/src/query/query.ts`: SDK stream mapping, runtime event mapping, tool progress preview limits, secret redaction, truncation, and full reference support. 
+- `/Users/truffle/work/murph/packages/core/src/substrate/pi-harness.ts`: Murph Pi harness, `transformContext`, tool hooks, thinking level, thinking budgets, and normalized event forwarding. +- `/Users/truffle/work/murph/packages/core/src/query/options.ts`: Murph thinking option normalization and Pi thinking-level mapping. +- `/Users/truffle/work/murph/packages/anthropic-sdk-shim/src/index.ts`: Public shim export for `MurphToolProgressMessage`. + +Why this matters: Phantom should read the Murph-normalized event stream and avoid reaching around Murph into provider-specific or Pi-specific internals. + +### Phantom Chat Backend And UI + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/types.ts`: Phantom wire protocol for session, message, thinking, tool, and error frames. +- `/Users/truffle/work/phantom-murph-hardening/src/chat/types-tool.ts`: Phantom wire tool-event shapes. +- `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire-handlers.ts`: Assistant and stream-event conversion into Phantom wire frames, including thinking and tool-use handling. +- `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire.ts`: System/result/user event conversion, compaction/rate/subagent/tool-progress mapping, and safe tool error handling. +- `/Users/truffle/work/phantom-murph-hardening/src/chat/writer.ts`: Chat stream writer, durable message commit, final assistant persistence, and timeline persistence. +- `/Users/truffle/work/phantom-murph-hardening/src/chat/message-builder.ts`: User attachment conversion for images, PDFs, documents, and text. +- `/Users/truffle/work/phantom-murph-hardening/src/chat/message-store.ts`: Durable chat message storage and `content_json` handling. +- `/Users/truffle/work/phantom-murph-hardening/src/agent/chat-query.ts`: Phantom runtime query setup with partial messages, progress summaries, prompt suggestions, thinking config, effort, and `transformContext`. 
+- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-types.ts`: Frontend message, thinking, tool, run activity, and timeline types. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts`: Wire-frame reducer for messages, thinking blocks, tools, statuses, compaction, rate limits, MCP connection, and subagents. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-dispatch-tools.ts`: Tool-call state machine and placeholder tool handling. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-activity.ts`: Run-activity summaries, active run timeline, compaction/rate/MCP/subagent/tool activity. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx`: Assistant rendering for thinking blocks, tool cards, markdown text, streaming indicators, and usage. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/thinking-block.tsx`: Phantom reasoning block labels and redacted-state display. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx`: Tool cards, tool icons, parameter display, output display, error and blocked states, full reference links, and redaction. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/run-activity-row.tsx`: Run activity row with status, facts, subagents, and tool cards. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/markdown.tsx`: ReactMarkdown, GFM, sanitize, custom links, and code block integration. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/code-block.tsx`: Code block header, language label, wrapping, and copy action. +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/message-list.tsx`: Message grouping, run timeline placement, current run activity row, and streaming accessibility text. 
+- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/hooks/use-chat.ts`: SSE consumption, resume, initial history loading, and durable message parsing. + +Why this matters: Phantom already has most of the scaffolding for honest progress and thinking, but file/artifact inspection, markdown polish, and some event coverage remain the highest-impact gaps. + +## Pi Representation Model + +### Thinking + +Evidence: + +- `/Users/truffle/work/pi-mono/packages/ai/src/types.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts` +- `/Users/truffle/work/pi-mono/packages/agent/src/agent.ts` +- `/Users/truffle/work/pi-mono/packages/agent/src/agent-loop.ts` + +Direct source facts: + +- Pi AI defines `ThinkingLevel` as reasoning levels in `/Users/truffle/work/pi-mono/packages/ai/src/types.ts`, and Pi agent-core adds an `off` state for agent configuration in `/Users/truffle/work/pi-mono/packages/agent/src/types.ts`. +- Pi AI represents thinking content as a content block with `type: "thinking"`, `thinking`, optional `thinkingSignature`, and optional `redacted` in `/Users/truffle/work/pi-mono/packages/ai/src/types.ts`. +- Pi streams thinking through `AssistantMessageEvent` variants: `thinking_start`, `thinking_delta`, and `thinking_end` in `/Users/truffle/work/pi-mono/packages/ai/src/types.ts`. +- Pi agent maps `state.thinkingLevel` into provider reasoning settings unless thinking is `off` in `/Users/truffle/work/pi-mono/packages/agent/src/agent.ts`.
+- Anthropic provider handling supports summarized or omitted thinking display, maps redacted thinking into a redacted thinking block, and preserves signature data in `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts`. +- OpenAI Responses handling requests encrypted reasoning content when reasoning is enabled and maps reasoning summaries into Pi thinking events in `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses.ts` and `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts`. +- Google handling distinguishes visible thought parts from encrypted thought signatures. The Google source states that `thought: true` is the marker for thinking and that `thoughtSignature` can appear on any part type in `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts`. +- Cross-model transcript transformation preserves redacted thinking only when safe for same-model continuity, converts non-empty cross-model thinking to text, skips empty thinking, and removes tool-call thought signatures when crossing models in `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts`. +- Pi transform code skips errored or aborted assistant messages because partial reasoning or incomplete tool calls can create API errors in `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts`. + +Inference: + +- Phantom should treat provider thinking as protocol data, not as a generic UI spinner. If Phantom did not receive a Murph/Pi thinking event, it should show working, waiting, streaming, tool running, or status text instead of claiming the model is thinking. +- Phantom should not display `thinkingSignature`, encrypted reasoning content, or Google `thoughtSignature` as user-readable content. These are continuity artifacts according to the provider adapter source files listed above. 
+- Phantom's current `thinking-block.tsx` choice to hide the thinking text and show labels such as "Thought", "Thought for Xs", or "Reasoning hidden" is safer than rendering raw provider reasoning by default. Evidence for the current UI behavior is `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/thinking-block.tsx`. + +### Streaming Text + +Evidence: + +- `/Users/truffle/work/pi-mono/packages/ai/src/types.ts` +- `/Users/truffle/work/pi-mono/packages/agent/src/agent-loop.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/StreamingMessageContainer.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/Messages.ts` +- `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire-handlers.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts` + +Direct source facts: + +- Pi streams assistant text through `text_start`, `text_delta`, and `text_end` events in `/Users/truffle/work/pi-mono/packages/ai/src/types.ts`. +- Pi agent loop forwards streaming assistant events as `message_update` events in `/Users/truffle/work/pi-mono/packages/agent/src/agent-loop.ts`. +- Pi web UI batches streaming message updates with `requestAnimationFrame` in `/Users/truffle/work/pi-mono/packages/web-ui/src/components/StreamingMessageContainer.ts`. +- Phantom backend maps assistant stream text into `message.text_delta` and related frames in `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire-handlers.ts`. +- Phantom frontend appends text deltas into assistant content in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts`. + +Inference: + +- Phantom already has the correct data path for streaming text. The main gap is rendering quality and persistence shape, not transport. +- Phantom should inspect whether multiple text blocks can arrive in a single assistant message. 
The current assistant renderer uses the first text block from `message.content.find(...)` in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx`, so later text blocks could be hidden if they occur. + +### Tool Calls And Tool Progress + +Evidence: + +- `/Users/truffle/work/pi-mono/packages/ai/src/types.ts` +- `/Users/truffle/work/pi-mono/packages/agent/src/types.ts` +- `/Users/truffle/work/pi-mono/packages/agent/src/agent-loop.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderer-registry.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderers/DefaultRenderer.ts` +- `/Users/truffle/work/murph/packages/core/src/types/message.ts` +- `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts` +- `/Users/truffle/work/murph/packages/core/src/query/query.ts` +- `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-dispatch-tools.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx` + +Direct source facts: + +- Pi tool calls are content blocks with `type: "toolCall"`, `id`, `name`, `arguments`, and optional `thoughtSignature` in `/Users/truffle/work/pi-mono/packages/ai/src/types.ts`. +- Pi tools accept `onUpdate` callbacks during execution in `/Users/truffle/work/pi-mono/packages/agent/src/types.ts`. +- Pi agent loop emits `tool_execution_start`, `tool_execution_update`, and `tool_execution_end` events around execution in `/Users/truffle/work/pi-mono/packages/agent/src/agent-loop.ts`. +- Pi web UI renders tools through a renderer registry and default renderer in `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderer-registry.ts` and `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderers/DefaultRenderer.ts`. 
+- Murph normalizes Pi and runtime tool activity into `tool_progress` and `tool_execution_*` events in `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts` and `/Users/truffle/work/murph/packages/core/src/query/query.ts`. +- Murph tool progress includes phase, elapsed time, duration, input preview, output preview, truncation flag, safe display flag, redactions, and full reference in `/Users/truffle/work/murph/packages/core/src/types/message.ts`. +- Phantom converts tool progress into `tool.running` and `tool.result` frames in `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire.ts`. +- Phantom frontend has a tool state machine that handles pending, input streaming, running, result, error, blocked, and aborted states in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-dispatch-tools.ts`. +- Phantom tool cards already show parameters, redacted output, block reasons, errors, full output reference, and a 12000-character display limit in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx`. + +Inference: + +- Phantom's progress and tool-card foundation is sound after Phase 10E. The next improvement should specialize built-in Phantom tools, MCP tools, file outputs, generated pages, and full references into richer inspection surfaces rather than adding more generic status rows. +- Tool output is not equivalent to artifact state. A tool output may mention a file, page, public URL, or full reference, but Phantom needs explicit artifact extraction or explicit frames to make those surfaces first-class. 
+ +### Usage And Completion + +Evidence: + +- `/Users/truffle/work/pi-mono/packages/ai/src/types.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/utils/event-stream.ts` +- `/Users/truffle/work/pi-mono/packages/agent/src/agent-loop.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/AgentInterface.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/Messages.ts` +- `/Users/truffle/work/murph/packages/core/src/events/translator-pi.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx` + +Direct source facts: + +- Pi `Usage` includes input, output, cache read, cache write, total tokens, and cost in `/Users/truffle/work/pi-mono/packages/ai/src/types.ts`. +- Pi stream completion uses `done` and `error` assistant events in `/Users/truffle/work/pi-mono/packages/ai/src/types.ts` and `/Users/truffle/work/pi-mono/packages/ai/src/utils/event-stream.ts`. +- Pi agent loop emits message and turn lifecycle events in `/Users/truffle/work/pi-mono/packages/agent/src/agent-loop.ts`. +- Pi web UI displays usage when available in `/Users/truffle/work/pi-mono/packages/web-ui/src/components/Messages.ts` and session stats in `/Users/truffle/work/pi-mono/packages/web-ui/src/components/AgentInterface.ts`. +- Murph translates Pi usage into normalized usage in `/Users/truffle/work/murph/packages/core/src/events/translator-pi.ts`. +- Phantom assistant messages render cost and token usage when `message.usage` exists in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx`. + +Inference: + +- Phantom can continue to show usage at assistant-message level, but should avoid making usage the primary sign of completion. Session lifecycle, stream stop, and tool completion events are better completion signals because usage can be absent or provider-dependent. 
+ +## Pi Rendering Patterns + +### Web UI Pattern To Adapt + +Evidence: + +- `/Users/truffle/work/pi-mono/packages/web-ui/README.md` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/ThinkingBlock.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/StreamingMessageContainer.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/MessageList.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/components/Messages.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderer-registry.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts-tool-renderer.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/ArtifactPill.ts` + +Direct source facts: + +- Pi web UI provides a complete chat panel, message list, streaming container, thinking block, tool renderers, attachments, and artifacts in `/Users/truffle/work/pi-mono/packages/web-ui/README.md`. +- Pi web UI renders thinking as a collapsible block in `/Users/truffle/work/pi-mono/packages/web-ui/src/components/ThinkingBlock.ts`. +- Pi web UI pairs tool calls with tool results and skips standalone tool-result messages in `/Users/truffle/work/pi-mono/packages/web-ui/src/components/MessageList.ts`. +- Pi web UI has a tool renderer registry with headers, status treatment, disclosure, and fallback rendering in `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderer-registry.ts`. +- Pi web UI includes an artifacts tool, artifact panel, artifact pills, tabs, preview types, diffs, code blocks, logs, and console surfaces in `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts.ts`, `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts-tool-renderer.ts`, and `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/ArtifactPill.ts`. 
+ +Inference: + +- Phantom should adapt these patterns: paired inline tool results, collapsible tool details, streaming update batching, honest thinking disclosure, artifact pills, artifact preview tabs, and file/page side-panel inspection. +- Phantom should not import Pi web UI as-is. The Pi implementation is built on `mini-lit`, has its own chat surface and artifact storage pattern, and does not match Phantom's React components, durable sessions, or native Phantom page/artifact model. Evidence for Phantom's React UI is in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx`, `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts`, and `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/hooks/use-chat.ts`. + +### CLI-Style Pattern To Adapt Selectively + +Evidence: + +- `/Users/truffle/work/pi-mono/packages/mom/src/log.ts` +- `/Users/truffle/work/pi-mono/packages/mom/src/store.ts` +- `/Users/truffle/work/pi-mono/packages/mom/src/agent.ts` + +Direct source facts: + +- Mom logs user input, tool lifecycle, response start, thinking, response text, downloads, stop, warnings, errors, and usage in `/Users/truffle/work/pi-mono/packages/mom/src/log.ts`. +- Mom truncates long output and adds context to tool logs in `/Users/truffle/work/pi-mono/packages/mom/src/log.ts`. +- Mom persists conversation events and downloaded attachments to local workspace files in `/Users/truffle/work/pi-mono/packages/mom/src/store.ts`. + +Inference: + +- Phantom can adapt Mom's concise temporal log shape for compact activity summaries and run timelines. +- Phantom should not adapt Mom as a UI implementation. It is a CLI-style surface around Slack/mom workflows, not a browser chat UI. Evidence for that workflow is `/Users/truffle/work/pi-mono/packages/mom/src/agent.ts`.
+ +## Ownership Split + +### What Murph And Pi Should Own + +Evidence: + +- `/Users/truffle/work/murph/AGENTS.md` +- `/Users/truffle/work/murph/ARCHITECTURE.md` +- `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts` +- `/Users/truffle/work/murph/packages/core/src/events/translator-pi.ts` +- `/Users/truffle/work/murph/packages/core/src/substrate/pi-harness.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts` + +Direct source facts: + +- Murph is a clean-room TypeScript agent runtime and should build generic runtime behavior rather than Phantom-specific shortcuts according to `/Users/truffle/work/murph/AGENTS.md`. +- Murph already owns normalized event grammar and Pi translation in `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts` and `/Users/truffle/work/murph/packages/core/src/events/translator-pi.ts`. +- Murph's Pi harness owns Pi substrate integration, tool hooks, thinking levels, thinking budgets, and normalized event forwarding in `/Users/truffle/work/murph/packages/core/src/substrate/pi-harness.ts`. +- Pi provider adapters own provider-specific reasoning and continuity details in `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts`, `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts`, and `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts`. + +Inference: + +- Murph and Pi should own provider transport, model capability handling, transcript transformation, thinking budgets, encrypted or redacted reasoning continuity, normalized stream events, tool execution lifecycle, tool progress envelopes, usage, retries, compaction, and cross-model safety. +- Murph should expose facts. 
It should not own Phantom page previews, artifact side panels, chat-specific visual affordances, or product-specific file browsing. + +### What Phantom Built-In Tools Should Own + +Evidence: + +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +- `/Users/truffle/work/phantom-murph-hardening/src/chat/message-builder.ts` +- `/Users/truffle/work/phantom-murph-hardening/src/chat/writer.ts` +- `/Users/truffle/work/phantom-murph-hardening/src/chat/message-store.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/hooks/use-chat.ts` + +Direct source facts: + +- The product direction file explicitly asks for files and artifacts as first-class UI surfaces, clear built-in versus MCP versus UI ownership, markdown quality, interactive inspection, and no fake provider thinking in `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md`. +- Phantom already transforms user attachments into message content in `/Users/truffle/work/phantom-murph-hardening/src/chat/message-builder.ts`. +- Phantom persists final assistant content and timelines in `/Users/truffle/work/phantom-murph-hardening/src/chat/writer.ts` and `/Users/truffle/work/phantom-murph-hardening/src/chat/message-store.ts`. +- Phantom tool cards already understand local tool names such as Read, Write, Edit, Bash, Glob, Grep, WebSearch, and WebFetch in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx`. + +Inference: + +- Phantom built-in tools should own Phantom-native operations that require product state, workspace state, auth, session identity, and safe preview semantics: page creation, page preview, generated file registration, attachment registration, durable artifact metadata, safe full-output references, and Phantom-owned file browsing. 
+- Phantom built-ins should produce explicit artifact metadata when they create or modify files or pages. UI should not have to scrape arbitrary prose when a built-in tool already knows the artifact identity. + +### What MCP Tools Should Own + +Evidence: + +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +- `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts` +- `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire-handlers.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-types.ts` + +Direct source facts: + +- Phantom wire and UI types track MCP connection state and MCP server metadata in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-types.ts`. +- Phantom backend marks tools as MCP based on tool naming conventions in `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire-handlers.ts`. +- Murph normalized events include MCP-adjacent generic tool execution events rather than Phantom-specific tool semantics in `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts`. + +Inference: + +- MCP tools should own external, reusable integrations such as third-party systems, browser automation, email, calendar, source control, search, and other portable capabilities. +- MCP tools should not be used as a substitute for Phantom-native file/page/session UI when the product already owns the underlying state and authorization model. 
+ +### What UI Affordances Should Own + +Evidence: + +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/markdown.tsx` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/code-block.tsx` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/run-activity-row.tsx` + +Direct source facts: + +- Phantom UI already renders expandable tool cards, code blocks with copy actions, markdown, and run activity rows in the files listed above. + +Inference: + +- UI affordances should open, copy, preview, filter, expand, collapse, retry, and inspect already-produced state. +- UI affordances should not execute hidden tools, invent artifacts, infer provider thinking, or mutate transcript content sent back to providers. + +## Phantom Recommendations Ordered By Impact + +### 1. Make Files And Artifacts First-Class Chat Surfaces + +Evidence: + +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts-tool-renderer.ts` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/ArtifactPill.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx` +- `/Users/truffle/work/murph/packages/core/src/types/message.ts` + +Inference: + +- Add an artifact extraction and rendering layer in Phantom UI that treats created files, edited files, generated pages, attachments, public URLs, and `full_ref` outputs as inspectable objects. +- Render compact artifact pills inline in tool cards and assistant messages.
+- Add an artifact inspector side panel or drawer with type-specific previews: text, markdown, code, image, PDF, HTML/page preview, diff, logs, and metadata. +- Preserve the tool card as the execution record, but let the artifact inspector become the place where users inspect durable outputs. +- Keep full references safe. A `full_ref` should be an opaque reference until a server endpoint validates scope, path, auth, and display safety. + +### 2. Keep Thinking Honest And Provider-Backed + +Evidence: + +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/thinking-block.tsx` +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` + +Inference: + +- Render thinking only when Murph/Phantom receives actual thinking events. +- For redacted or hidden reasoning, show explicit labels such as "Reasoning hidden" and optionally duration. Do not show fake summaries. +- For providers without reasoning events, use status language such as "Working", "Calling tools", "Reading files", or "Waiting for model" instead of "Thinking". +- Never expose `thinkingSignature`, encrypted reasoning content, Google `thoughtSignature`, or provider replay payloads in UI. +- Avoid treating tool progress, compaction, retries, or MCP connection events as model thinking. + +### 3. 
Clarify Built-In Tool, MCP Tool, And UI Labels + +Evidence: + +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +- `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire-handlers.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-types.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx` + +Inference: + +- Add visible but compact metadata for tool origin: Phantom built-in, MCP server, or local/runtime tool. +- For built-in tools, prefer product words such as "Created page", "Updated file", "Read workspace", or "Generated preview". +- For MCP tools, show the server name and tool name, because the current backend already derives MCP metadata from tool naming conventions in `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire-handlers.ts`. +- For UI affordances, do not present actions such as opening, copying, expanding, previewing, or filtering as tool calls. They are inspection controls. + +### 4. Upgrade Markdown Rendering Quality + +Evidence: + +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/markdown.tsx` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/code-block.tsx` +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` + +Inference: + +- Keep `remark-gfm` and `rehype-sanitize`, but add polished table, task-list, blockquote, ordered-list, unordered-list, inline-code, pre/code, and link treatments. +- Add syntax highlighting or language-aware styling to code blocks while preserving copy actions. +- Detect safe local artifact references and generated page links, then render them as artifact pills or preview links. +- Keep raw HTML sanitized. Any custom renderer for links, images, or code must preserve sanitize guarantees. + +### 5. 
Fill Event Coverage Gaps Before Building More UI States + +Evidence: + +- `/Users/truffle/work/murph/packages/core/src/types/message.ts` +- `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts` +- `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-types.ts` + +Inference: + +- Phantom currently maps compaction, rate limits, subagents, and tool progress, but should audit additional Murph events such as `api_retry`, `files_persisted`, `tool_use_summary`, `auth_status`, `local_command_output`, hook progress, plugin install, session state, notification, memory recall, and mirror errors before adding separate bespoke UI states. +- Event coverage should stay factual. If Murph does not emit an event, Phantom UI should not synthesize a provider-like state. + +## Risks And Anti-Patterns + +### Protocol And Provider Risks + +Evidence: + +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts` +- `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts` +- `/Users/truffle/work/murph/packages/core/src/events/translator-pi.ts` + +Risks: + +- Displaying encrypted reasoning payloads, `thinkingSignature`, or `thoughtSignature` as user-readable thinking would leak continuity metadata and misrepresent provider semantics. +- Treating Google `thoughtSignature` as proof of visible thinking would contradict the adapter comment in `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts`. +- Rewriting provider transcript content for UI display can break tool-call protocol and cross-model continuity. Pi transform code already has explicit safeguards in `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts`. 
+- Rendering raw provider reasoning by default can expose unsafe or private reasoning material. The product direction file asks to avoid fake provider thinking, and the provider files show that visible reasoning semantics vary by provider. + +### Tool And Artifact Risks + +Evidence: + +- `/Users/truffle/work/murph/packages/core/src/query/query.ts` +- `/Users/truffle/work/murph/packages/core/src/types/message.ts` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx` +- `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts.ts` + +Risks: + +- Full output references can become path traversal or data exposure risks if the UI dereferences them without server-side validation. Murph exposes `full_ref` as part of tool progress in `/Users/truffle/work/murph/packages/core/src/types/message.ts`. +- Tool inputs and outputs can contain secrets. Murph redaction exists in `/Users/truffle/work/murph/packages/core/src/query/query.ts`, and Phantom tool cards have frontend redaction in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx`; neither should be weakened by artifact previews. +- Importing Pi's in-memory artifact tool directly would duplicate Phantom storage and authority. Pi's artifact implementation is in `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts.ts`, while Phantom durability is in `/Users/truffle/work/phantom-murph-hardening/src/chat/writer.ts` and `/Users/truffle/work/phantom-murph-hardening/src/chat/message-store.ts`. +- UI-only artifact messages must not leak back into model context. Pi web UI has conversion logic to filter artifact messages in `/Users/truffle/work/pi-mono/packages/web-ui/src/components/Messages.ts`. 
+ +### UX And Product Risks + +Evidence: + +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/markdown.tsx` +- `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/message-list.tsx` +- `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` + +Risks: + +- A chat UI that only shows transient tool cards leaves generated files and pages feeling like log output instead of first-class product artifacts. +- The current assistant renderer appears to render only the first text block from an assistant message in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx`. If multiple text blocks can arrive, content may be hidden. +- Markdown that is technically correct but visually weak can make good model output feel untrustworthy. Current markdown support is grounded in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/markdown.tsx`. +- Confusing UI affordances with tools can make the user think opening or copying a file changed runtime state. The product direction file asks for clear separation in `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md`. + +## Concrete Acceptance Criteria For Next Builder Slice + +Inference: the next builder slice should be a Phantom chat UI slice focused on first-class artifacts, honest thinking display, and markdown polish. It should not require Murph runtime changes unless the event coverage audit finds a missing normalized event that already exists in Murph but is not wired into Phantom. + +Acceptance criteria: + +1. Artifact extraction + - Phantom frontend derives artifact candidates from existing wire frames, tool outputs, tool names, `full_ref`, generated page URLs, and attachment metadata. 
+ - Artifact candidates preserve source message id, tool call id when present, display label, type, origin, safe preview status, and raw reference. + - Extraction does not mutate model transcript content. + +2. Artifact rendering + - Tool cards and assistant markdown can render compact artifact pills for recognized files, pages, URLs, and full references. + - Clicking an artifact pill opens an inspector panel or drawer without starting a new model or tool call. + - Inspector supports at least text, markdown, code, image, PDF, generated page URL, and opaque full reference metadata. + - Full reference preview is disabled unless a server endpoint validates it as safe to display. + +3. Thinking honesty + - Thinking UI renders only from received thinking frames. + - Redacted thinking displays a redacted or hidden label and does not expose signatures or encrypted content. + - No provider without thinking frames is shown as "thinking"; status language uses runtime facts such as tool running, compaction, retry, or streaming text. + - Tests cover text thinking, redacted thinking, and no-thinking provider behavior. + +4. Tool-origin clarity + - Tool cards visually distinguish Phantom built-in, MCP, and generic runtime tools using existing metadata or clearly documented heuristics. + - MCP cards show server name when available. + - UI-only actions such as preview, copy, open, expand, collapse, filter, and retry are not displayed as tool calls. + +5. Markdown polish + - Tables, lists, blockquotes, inline code, fenced code, links, and task lists render with polished spacing and wrapping. + - Code blocks keep copy behavior and add language-aware presentation. + - Sanitization remains enabled. + - Safe artifact links in markdown render as links or artifact pills without enabling raw HTML execution. + +6. Persistence and replay + - Reloaded chat history shows durable assistant text, historical run timeline, and artifact references when their source data is persisted. 
+ - If artifact details are transient and unavailable after reload, the UI clearly shows metadata instead of a broken preview. + - Existing session resume behavior in `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/hooks/use-chat.ts` remains intact. + +7. Verification + - Unit tests cover reducer/extractor behavior for thinking, tool outputs, artifact candidates, and markdown link rendering. + - Component tests or browser checks cover artifact inspector open/close, tool-card expansion, markdown tables, code blocks, and redacted thinking. + - No application code outside the approved builder slice is changed. + - No explicit `any`, no `@ts-ignore`, and no hidden type escapes are introduced. + +## Highest-Signal Findings + +1. Pi already has honest thinking primitives, including redacted/encrypted continuity handling, but those primitives are provider-specific. Phantom should render only Murph/Pi-backed thinking frames and should never display signatures or encrypted reasoning payloads. Evidence: `/Users/truffle/work/pi-mono/packages/ai/src/types.ts`, `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts`, `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts`, `/Users/truffle/work/pi-mono/packages/ai/src/providers/google-shared.ts`. + +2. Pi web UI is a valuable pattern library, not a drop-in dependency for Phantom. Adapt the ideas of paired tool results, collapsible thinking, streaming batching, renderer registry, artifact pills, and artifact inspector, but implement them in Phantom's React and durable session model. Evidence: `/Users/truffle/work/pi-mono/packages/web-ui/src/components/Messages.ts`, `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/renderer-registry.ts`, `/Users/truffle/work/pi-mono/packages/web-ui/src/tools/artifacts/artifacts.ts`, `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts`. + +3. The next product jump is first-class files and artifacts. 
Current Phantom tool cards are useful execution records, but generated pages, edited files, attachments, URLs, and `full_ref` outputs need artifact pills and an inspector so outputs are not trapped inside log text. Evidence: `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md`, `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx`, `/Users/truffle/work/murph/packages/core/src/types/message.ts`. + +4. Ownership should stay split: Pi and Murph own provider protocol, transcript safety, thinking continuity, normalized events, usage, and tool progress; Phantom owns product-specific built-ins, pages, files, artifacts, previews, and UI inspection; MCP owns external reusable integrations. Evidence: `/Users/truffle/work/murph/packages/core/src/substrate/pi-harness.ts`, `/Users/truffle/work/murph/packages/core/src/events/translator-pi.ts`, `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md`. + +5. Markdown quality and replay durability are now part of trust. Phantom has GFM and sanitize, but needs stronger table/code/link/artifact rendering, and the current assistant renderer should be checked for multi-text-block messages because it selects only the first text block. Evidence: `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/markdown.tsx`, `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/code-block.tsx`, `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx`. + +## Self Review + +- Every direct factual claim in this report is tied to a local source path in the same paragraph or bullet group. +- Recommendations and product choices are labeled as inference. +- No external source was needed because the requested evidence exists in local Pi, Murph, and Phantom source files. +- No application code was edited. 
+- The only file written for this task is `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-pi-thinking-research.md`. diff --git a/research/chat-experience/phase-10h-product-direction.md b/research/chat-experience/phase-10h-product-direction.md new file mode 100644 index 00000000..6fc5e25c --- /dev/null +++ b/research/chat-experience/phase-10h-product-direction.md @@ -0,0 +1,79 @@ +# Phase 10H Product Direction: Best-in-Class Chat + +Date: 2026-05-01 + +## Operator Direction + +Cheema wants Phantom's chat to feel like a product people choose over other +agent surfaces, not merely a working transcript. The experience needs obsessive +attention to detail: + +- Long-running tasks should never feel dead. The user should see meaningful + live activity, tool usage, compaction, waiting, recovery, and completion. +- Tool cards should be useful when collapsed and rich when expanded. +- Thinking/progress should be honest across providers. Do not fake private + chain-of-thought. Show redacted/private reasoning state, summaries, timing, + usage, or provider-supported reasoning signals when those signals really + exist. +- Files and artifacts should feel first-class. If the agent creates or reads a + file, the UI should make it inspectable when safe, with links, previews, and + metadata rather than raw walls of text. +- Markdown should feel polished and predictable. Code, tables, links, lists, + citations, and generated URLs should render clearly. +- Interaction details matter: borders, spacing, sticky composer behavior, + disabled/loading states, command affordances, copy actions, reveal controls, + and error recovery should all feel intentional. + +## Tools and Capability Model + +Every new capability should ask where it belongs: + +- **Murph/Pi** owns provider transport, thinking levels, model metadata, + overflow/compaction primitives, core agent event semantics, and tool execution + seams. 
+- **Phantom built-in or CLI tools** are best for core app capabilities that must + be reliable, low-latency, and tightly integrated with Phantom state, files, + pages, sessions, and auth. +- **MCP tools** are best for external integrations or capabilities that benefit + from a standard tool server boundary, independent lifecycle, or reuse across + agents. +- **UI affordances** are best for browsing, opening, copying, previewing, + filtering, expanding, retrying, and inspecting data already produced by the + agent. Do not force the agent to call a tool when the browser can safely show + an existing artifact. + +Default bias: + +1. Reuse Pi or Murph if the primitive already exists. +2. Use Phantom built-ins for Phantom-native files, pages, artifacts, sessions, + previews, and chat ergonomics. +3. Use MCP for external or reusable integrations. +4. Use UI controls for inspection and interaction with already-known state. + +## Product Bar + +The target is not novelty. The target is clarity, trust, and flow: + +- The user always knows whether the agent is thinking, using a tool, waiting, + compacting, retrying, done, blocked, or errored. +- The UI shows enough detail to build trust without drowning the user. +- Expanded tool detail should be structured: parameters, output, previews, + links, generated files, and safe full-output references. +- Hidden or redacted provider thinking should be labeled honestly. +- Visual language should use product icons and clear labels. Emojis are not a + default system primitive for professional surfaces. +- All visible text must fit on desktop and mobile. Controls should not jump + around during streaming. + +## Immediate Research Questions + +1. What does Pi/Pi Code already provide for thinking, progress, CLI rendering, file + display, and tool activity? +2. Which Phantom chat states are currently missing, misleading, or visually + underpowered? +3.
Which capabilities should become Phantom built-in tools versus MCP tools + versus UI-only affordances? +4. What can OpenAI, Anthropic, and ZAI truthfully expose as thinking/reasoning + through Pi/Murph today? +5. What is the smallest next builder slice that improves the live experience + materially while remaining testable? diff --git a/research/chat-experience/phase-10h-provider-thinking-research.md b/research/chat-experience/phase-10h-provider-thinking-research.md new file mode 100644 index 00000000..28dc1c5f --- /dev/null +++ b/research/chat-experience/phase-10h-provider-thinking-research.md @@ -0,0 +1,353 @@ +# Phase 10H Provider Thinking Research + +Date: 2026-05-01 + +## Provider Capability Matrix Headline + +Phantom can honestly show live reasoning state across OpenAI, Anthropic, and ZAI, but it should only show reasoning text when Murph/Pi can prove it is a provider-supported summary or safe display text. Redacted, encrypted, private, or unknown thinking must be hidden or labeled as private. Separate thinking token counts are not available to Phantom today because Pi, Murph, and the Phantom wire protocol drop provider reasoning token fields. + +## Executive Findings + +Phantom's current UI posture is conservative and mostly correct: it displays "Thinking...", "Thought", or "Reasoning hidden" and does not render thinking text in `ThinkingBlock`. However, the browser store still accumulates `message.thinking_delta` text in memory, so private reasoning can still cross the Phantom wire and land client-side. + +OpenAI, Anthropic, and ZAI do not expose the same product object. OpenAI Responses exposes optional reasoning summaries, encrypted reasoning items, and upstream reasoning token counts. Anthropic Messages exposes thinking blocks, signatures, summarized or omitted display modes in Pi, and redacted thinking blocks. ZAI exposes `reasoning_content` for GLM thinking. That ZAI signal is best treated as private reasoning, not a user-facing summary. 
+ +The missing product primitive is not another visual treatment. It is an event-contract field that distinguishes `summary`, `display_text`, `private`, `redacted`, `encrypted`, and `unknown` thinking. Without that field, Phantom should not render thinking text verbatim, even when the provider stream calls it "thinking". + +The added product direction is the right bar: never fake private chain-of-thought, never let long-running tasks feel dead, and map provider signals into honest UI states. Murph/Pi should own provider semantics and thinking visibility. Phantom should render known states, safe summaries, timing, usage, tools, compaction, retries, blocked states, and errors. + +## Source Map + +Primary local files reviewed: + +- Murph normalized event and usage model: `/Users/truffle/work/murph/packages/core/src/events/normalized-event.ts` +- Murph Pi translator: `/Users/truffle/work/murph/packages/core/src/events/translator-pi.ts` +- Murph message and usage types: `/Users/truffle/work/murph/packages/core/src/types/message.ts` +- Murph harness thinking options: `/Users/truffle/work/murph/packages/core/src/substrate/harness.ts` +- Murph query option normalization: `/Users/truffle/work/murph/packages/core/src/query/options.ts` +- Murph query event mappers: `/Users/truffle/work/murph/packages/core/src/query/query.ts` +- Murph built-in model capability records: `/Users/truffle/work/murph/packages/core/src/providers/models.ts` +- Murph Pi adapter: `/Users/truffle/work/murph/packages/core/src/substrate/pi-adapter.ts` +- Pi shared types: `/Users/truffle/work/pi-mono/packages/ai/src/types.ts` +- Pi OpenAI Responses provider: `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses.ts` +- Pi OpenAI Responses stream processor: `/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-responses-shared.ts` +- Pi Anthropic provider: `/Users/truffle/work/pi-mono/packages/ai/src/providers/anthropic.ts` +- Pi OpenAI-compatible completions provider: 
`/Users/truffle/work/pi-mono/packages/ai/src/providers/openai-completions.ts` +- Pi message transformer: `/Users/truffle/work/pi-mono/packages/ai/src/providers/transform-messages.ts` +- Phantom wire types and translators: `/Users/truffle/work/phantom-murph-hardening/src/chat/types.ts`, `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire.ts`, `/Users/truffle/work/phantom-murph-hardening/src/chat/sdk-to-wire-handlers.ts` +- Phantom durable timeline: `/Users/truffle/work/phantom-murph-hardening/src/chat/run-timeline.ts` +- Phantom UI state and thinking component: `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-types.ts`, `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts`, `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/thinking-block.tsx` +- Product direction: `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` + +Official provider docs checked: + +- OpenAI reasoning guide: https://developers.openai.com/api/docs/guides/reasoning +- OpenAI Responses API reference: https://developers.openai.com/api/reference/resources/responses/methods/create +- Anthropic extended thinking guide: https://platform.claude.com/docs/en/build-with-claude/extended-thinking +- ZAI chat completion API: https://docs.z.ai/api-reference/llm/chat-completion +- ZAI thinking mode guide: https://docs.z.ai/guides/capabilities/thinking-mode +- ZAI deep thinking guide: https://docs.z.ai/guides/capabilities/thinking +- ZAI tool streaming guide: https://docs.z.ai/guides/capabilities/stream-tool + +## Event Taxonomy + +### Thinking + +Pi has `thinking_start`, `thinking_delta`, and `thinking_end` stream events in `AssistantMessageEvent`. `ThinkingContent` has `thinking`, optional `thinkingSignature`, and optional `redacted`. + +Murph normalizes Pi thinking into `thinking_start`, `thinking_delta`, and `thinking_end`. 
`NormalizedUsage` only has input, output, cache read, cache write, total, and cost. It does not have `reasoningTokens` or `thinkingTokens`. + +Murph SDK-style stream mapping converts normalized thinking events into `stream_event` content block start, delta, and stop with `content_block.type: "thinking"`. + +Phantom converts SDK stream events to `message.thinking_start`, `message.thinking_delta`, and `message.thinking_end`. `ThinkingStartFrame` includes `redacted: boolean`; `ThinkingDeltaFrame` only has text delta; `ThinkingEndFrame` has optional duration. + +Current UI state stores thinking text in `ThinkingBlockState.text`, but `ThinkingBlock` does not render it. Durable run timelines also drop raw thinking text and only retain labels such as "Thinking..." or "Finished reasoning." + +Existing event basis: + +- Pi `AssistantMessageEvent` +- Murph `NormalizedEvent` thinking variants +- SDK `stream_event` +- Phantom `message.thinking_start`, `message.thinking_delta`, `message.thinking_end` + +Contract gaps: + +- No `thinkingVisibility` or `thinkingKind` field exists to say whether text is a provider summary, user-displayable thinking, private chain-of-thought, redacted, encrypted, or unknown. +- No server-side Phantom rule prevents private thinking deltas from crossing into the browser. + +### Redacted Thinking + +Pi represents redaction as `ThinkingContent.redacted` with opaque encrypted payload stored in `thinkingSignature`. Anthropic redacted thinking enters Pi as a thinking block with `redacted: true`, placeholder text, and signature data. + +Murph final assistant content maps Pi redacted thinking to `MurphRedactedThinkingBlock` with `type: "redacted_thinking"` and optional `data`. The normalized event union has a `redacted_thinking` event type, but the Pi stream translator currently maps Pi stream `thinking_start` the same way for redacted and non-redacted blocks. It does not emit a normalized redacted content-block start during streaming. 
+ +Phantom final assistant handling recognizes `redacted_thinking` and emits `message.thinking_start` with `redacted: true` and no delta. Stream handling can also handle a `content_block_start` with `content_block.type: "redacted_thinking"`, but Murph does not reliably produce that stream shape today. + +Existing event basis: + +- Pi `ThinkingContent.redacted` +- Murph final content block `redacted_thinking` +- Phantom `message.thinking_start.redacted` + +Contract gaps: + +- Murph should emit a streaming redacted-thinking signal when the provider says a block is redacted, not only rely on final assistant reconciliation. +- The redacted payload must remain opaque and must never be displayed, parsed, summarized, or copied into user-visible text. + +### Progress Summaries + +Murph has runtime events for `tool_execution_start`, `tool_execution_update`, `tool_execution_end`, direct `tool_progress`, `subagent_start`, `subagent_progress`, `subagent_end`, `compact_started`, `compact_completed`, `session_state`, `prompt_suggestion`, `permission_request`, `permission_decision`, hooks, notifications, and errors. + +Murph maps tool execution events into `tool_progress` messages with safe previews, truncation, and secret redactions. It maps subagent events into task system messages. Phantom maps these into tool cards, subagent activity, and durable run activity labels. + +Phantom product direction says long-running tasks should never feel dead. Existing event support is enough for thinking, tool running, compaction, rate limits, subagents, blocked tools, aborted tools, done, and error. Waiting and retry states need better surfaced contracts. 
+ +Existing event basis: + +- Murph normalized `tool_execution_*`, `tool_progress`, `subagent_*`, `compact_*`, `session_state` +- Phantom `message.tool_call_*`, `message.subagent_*`, `session.status`, `session.compact_boundary` + +Contract gaps: + +- Murph has normalized `api_retry`, and `MurphAPIRetryMessage` exists, but `NormalizedRuntimeEventMapper` does not map `api_retry` to an SDK message and Phantom has no `session.retry` or equivalent frame. +- `compact_completed` includes post-token count in Murph, but Phantom's `session.compact_boundary` only carries `pre_tokens`. + +### Token Usage + +Pi `Usage`, Murph `NormalizedUsage`, Murph `MurphUsage`, Murph `MurphModelUsage`, and Phantom `SessionDoneFrame.usage` carry normal input, output, cache, total, and cost fields. None carry separate thinking or reasoning token counts. + +OpenAI upstream exposes reasoning token counts under `output_tokens_details.reasoning_tokens`, but Pi's Responses stream processor discards that field and only persists output token totals. ZAI docs show prompt, completion, cached prompt, and total token usage in Chat Completions, not a separate reasoning token field. Anthropic usage in Pi is also reduced to input, output, cache read, cache write, total, and cost. + +Existing event basis: + +- Murph `NormalizedUsage` +- Murph `MurphUsage` +- Phantom `session.done.usage` + +Contract gaps: + +- Add `reasoningTokens?: number` or `thinkingTokens?: number` at Pi `Usage`, Murph `NormalizedUsage`, Murph `MurphUsage`, Murph `MurphModelUsage`, result messages, Phantom `SessionDoneFrame.usage`, and run timeline summaries. +- Add provider attribution for whether reasoning tokens are provider-counted, estimated, or unavailable. Do not estimate counts in UI. + +### Tool Calls + +Pi streams `toolcall_start`, `toolcall_delta`, and `toolcall_end`. Murph normalizes these as `tool_call_start`, `tool_call_delta`, and `tool_call_end`. 
Phantom exposes `message.tool_call_start`, `message.tool_call_input_delta`, `message.tool_call_input_end`, `message.tool_call_running`, `message.tool_call_result`, `message.tool_call_blocked`, and `message.tool_call_aborted`. + +ZAI supports `tool_stream` for streaming tool calls on supported GLM routes, and Murph's ZAI OpenAI-compatible route enables `zaiToolStream`. + +Existing event basis: + +- Pi toolcall stream events +- Murph normalized tool call and tool execution events +- Phantom tool call frames + +Contract gaps: + +- Tool cards can be richer with structured parameters, safe output references, files, and previews, but that is product/UI work rather than provider thinking semantics. + +### Rate Limits + +Murph normalized `rate_limit` has status, reset time, type, utilization, and overage status. Murph maps it to `rate_limit_event`. Phantom maps it to `session.rate_limit` with status, type, reset time, and utilization. + +Existing event basis: + +- Murph `rate_limit` +- SDK `rate_limit_event` +- Phantom `session.rate_limit` + +Contract gaps: + +- Phantom drops `overageStatus`. +- Retry and waiting states should be surfaced separately from rate-limit state. + +## Provider Capability Matrix + +| Provider route | Upstream signal | Pi/Murph today | Honest UI treatment | Token count status | +| --- | --- | --- | --- | --- | +| OpenAI via `openai-responses` | Reasoning models use private reasoning tokens. Raw reasoning tokens are not exposed through the API. Optional reasoning summaries appear only when opted in. Encrypted reasoning items can be included for continuity. Usage includes upstream `output_tokens_details.reasoning_tokens`. | Murph marks GPT-5.x and o3 routes as thinking-capable. Pi sends `reasoning.effort`, defaults `summary` to `auto`, includes `reasoning.encrypted_content`, streams reasoning summaries as Pi `thinking_*`, stores the final reasoning item JSON in `thinkingSignature`, and drops separate `reasoning_tokens` from usage. 
| Show active "Reasoning" state. Show summary text only after Murph/Pi labels it `summary`. Hide encrypted content and signatures. Do not claim raw chain-of-thought is visible. | Upstream count exists for OpenAI Responses. Phantom cannot show it today because Pi and Murph discard it. | +| Anthropic via `anthropic-messages` | Extended thinking can return thinking blocks, signatures, and `redacted_thinking` blocks. Anthropic docs describe redacted data as opaque encrypted content with no readable summary. Pi also supports summarized or omitted thinking display. | Murph marks Opus/Sonnet routes as thinking-capable and Haiku as not thinking-capable. Pi maps thinking blocks to `thinking_*`, maps redacted thinking to Pi `ThinkingContent.redacted`, computes standard usage only, and can choose summarized or omitted display. Murph final assistant content preserves `redacted_thinking`; streaming redaction is not clearly distinguished at normalized event level. | Show active "Reasoning" state. Show provider summary only if Murph/Pi labels the text as summary or displayable. Show "Reasoning hidden" for redacted blocks. Never display signature or `data`. | No separate thinking-token field reaches Phantom today. Do not show a count. | +| ZAI GLM via `openai-compat` config `zai` and `openai-completions` | ZAI GLM thinking is enabled by default for GLM-5.1, GLM-5, and GLM-4.7. Responses can include `reasoning_content`; streaming deltas can include `reasoning_content`, visible content, and tool calls. Preserved thinking requires exact unmodified `reasoning_content` replay. | Murph routes `glm-5` and `glm-5.1` through OpenAI-compatible completions with `thinking: true` and `toolStreaming: true`. Pi sets top-level `enable_thinking` when reasoning effort is present, enables `tool_stream`, and streams `reasoning_content`, `reasoning`, or `reasoning_text` as `thinking_*`. | Treat as private reasoning by default. Show active "Reasoning" state and timing. 
Do not render `reasoning_content` verbatim unless a future provider policy and Murph event label explicitly mark it user-displayable. | ZAI docs show normal prompt, completion, cache, and total usage. No separate reasoning token field reaches Phantom today. | + +## What Can Be Shown + +### Safe to Show Verbatim + +- Assistant final answer text. +- User-authored text and attachments metadata already visible to the user. +- Tool names, structured parameters, and outputs only through existing safe preview, truncation, redaction, and full-reference rules. +- Rate limit status, reset time, utilization, compaction trigger, pre-token count, subagent summaries, and run status labels. +- OpenAI reasoning summaries and Anthropic summarized thinking only after Murph/Pi explicitly labels them as summaries or user-displayable provider text. + +### Can Be Shown as Summary or Status + +- "Reasoning", "Thinking", "Reasoning hidden", "Provider reasoning summary available", "Compacting context", "Retrying provider request", "Waiting for rate limit", "Using tool", "Blocked", "Errored", and "Completed" states. +- Provider reasoning summary text when it is a real upstream summary signal, not generated from private chain-of-thought. +- Tool progress summaries generated from safe tool events. +- Token usage totals that are actually present in the event stream. + +### Must Be Hidden or Labeled Private + +- OpenAI raw reasoning tokens. OpenAI does not expose raw reasoning tokens through the API. +- OpenAI `encrypted_content` and full reasoning item payloads used only for continuity. +- Anthropic `signature` and `redacted_thinking.data`. +- Anthropic redacted thinking blocks. Display only a hidden or redacted label. +- ZAI `reasoning_content` until the event contract marks it safe display text. Treat it as private chain-of-thought, because the provider describes it as reasoning process content rather than a summary. +- Any thinking text with unknown provenance. 
+ +## Can Phantom Show Thinking Tokens Today? + +No. + +OpenAI upstream can expose reasoning token counts, but the value is lost before Phantom: + +1. OpenAI Responses usage includes `output_tokens_details.reasoning_tokens`. +2. Pi `processResponsesStream` reads response usage and stores input, output, cache read, cache write, total, and cost only. +3. Pi `Usage` has no reasoning-token field. +4. Murph `NormalizedUsage`, `MurphUsage`, `MurphModelUsage`, and `RunAttemptResult.usage` have no reasoning-token field. +5. Phantom `SessionDoneFrame.usage`, assistant `usage_delta`, chat message state, and durable run timeline have no reasoning-token field. + +Required event-contract addition: + +```ts +type ReasoningTokenUsage = { + reasoningTokens?: number; + reasoningTokenSource?: "provider" | "unavailable"; +}; +``` + +This should be threaded from Pi provider usage parsing through Murph normalized usage and into Phantom `session.done.usage`. Phantom should display a count only when `reasoningTokenSource === "provider"` and `reasoningTokens` is a finite number. + +Do not estimate thinking tokens from text length. Do not infer hidden reasoning effort from duration. Do not show counts for providers that only expose blended output tokens. + +## Recommended Honest UI Model + +### Product Model + +Use one "Live Activity" stack driven by real events: + +- Reasoning: active or completed, with elapsed duration. +- Reasoning hidden: provider redacted, encrypted, private, or unknown. +- Provider reasoning summary: expandable only when `thinkingVisibility === "summary"` or equivalent exists. +- Tool activity: started, input streaming, running, partial output, result, error, blocked, aborted. +- Waiting and retrying: visible provider retry or rate-limit wait states. +- Compacting: explicit compaction start and completion state. +- Subagents: started, progress, completed, failed, stopped. +- Done, blocked, aborted, or errored: terminal state. 
+ +### Event Contract + +Add a Murph/Pi thinking visibility field before rendering any thinking text: + +```ts +type ThinkingVisibility = + | "summary" + | "display_text" + | "private" + | "redacted" + | "encrypted" + | "unknown"; +``` + +Map providers conservatively: + +- OpenAI reasoning summary deltas: `summary` +- OpenAI encrypted reasoning content: `encrypted` +- Anthropic `redacted_thinking`: `redacted` +- Anthropic summarized display: `summary` or `display_text`, depending on the provider payload and selected display mode +- Anthropic omitted thinking: `encrypted` or `private` +- ZAI `reasoning_content`: `private` by default +- Any unclassified OpenAI-compatible `reasoning`, `reasoning_text`, or `reasoning_content`: `unknown` or `private` + +Phantom should only render text for `summary` and `display_text`. For all other values, it should show state, duration, and a short label. It should not ship private text to the browser when the visibility is `private`, `redacted`, `encrypted`, or `unknown`. + +### Current Safe Builder Slice + +The smallest safe slice is: + +1. Keep the current non-rendering thinking component. +2. Add server-side suppression of private or unknown thinking deltas once Murph/Pi can label them. +3. Add UI labels for `summary`, `private`, `redacted`, and `encrypted`. +4. Add retry/waiting frames for `api_retry`. +5. Add reasoning token usage only for provider-counted values. + +Until the visibility field exists, leave thinking text hidden and use only state, duration, tool progress, compaction, rate limits, and subagent progress for liveliness. + +## Provider-Specific Notes + +### OpenAI + +OpenAI docs say reasoning models generate reasoning tokens that are not visible through the API, while optional reasoning summaries can be requested. The docs also show encrypted reasoning items for stateless or zero-data-retention continuity and upstream reasoning-token usage under `output_tokens_details.reasoning_tokens`. 
+ +Pi already requests `reasoning.encrypted_content` and defaults the summary setting to `auto` for reasoning-capable OpenAI Responses models. Pi streams `response.reasoning_summary_text.delta` as `thinking_delta`, so that stream is a provider summary, not raw chain-of-thought. The missing piece is a label that preserves this distinction for Phantom. + +OpenAI UI rule: + +- Render as "Reasoning" while active. +- Render summary text only after Murph/Pi labels it `summary`. +- Hide encrypted reasoning content and JSON signatures. +- Display reasoning token count only after the OpenAI `reasoning_tokens` field is preserved through Pi, Murph, and Phantom. + +### Anthropic + +Anthropic extended thinking can return thinking blocks, signatures, and redacted thinking. Anthropic docs say redacted thinking contains opaque encrypted data and no readable summary, and the opaque fields should be passed back unchanged for continuity when needed. + +Pi supports `thinkingDisplay: "summarized" | "omitted"`. It defaults to summarized in local code, even though the docs note model-specific display behavior. Pi maps redacted thinking to a redacted `ThinkingContent`, but the stream currently looks like a normal `thinking_start` until final content reconciliation. + +Anthropic UI rule: + +- Render as "Reasoning" while active. +- If Murph/Pi labels a displayed block as summarized thinking, optionally show it as "Provider reasoning summary". +- For `redacted_thinking`, show "Reasoning hidden" with no text. +- Never expose `signature` or redacted `data`. + +### ZAI + +ZAI docs show GLM thinking can be enabled or disabled with a `thinking` parameter, and current GLM-5.1, GLM-5, and GLM-4.7 thinking is enabled by default. The Chat Completion API includes `reasoning_content`, and streaming can include `reasoning_content`, `content`, and `tool_calls`. ZAI preserved thinking requires exact unmodified reasoning-content replay. + +Murph routes ZAI as OpenAI-compatible completions. 
Pi recognizes `reasoning_content`, `reasoning`, or `reasoning_text` and streams those fields as thinking. Because this is described as reasoning process content rather than an explicit summary, Phantom must treat it as private by default. + +ZAI UI rule: + +- Render as "Reasoning" while active. +- Do not display `reasoning_content` text. +- Preserve provider-required reasoning content for replay only inside the provider transport layer, not as user-visible UI. +- Use `tool_stream` events for live tool cards and progress. + +## Tests Needed + +### Murph and Pi Contract Tests + +- OpenAI Responses fixture: `response.reasoning_summary_text.delta` becomes thinking with `thinkingVisibility: "summary"`, final encrypted reasoning payload remains hidden, and upstream `reasoning_tokens` is preserved into usage. +- Anthropic fixture: `thinking` with summarized display becomes `thinkingVisibility: "summary"` or `display_text`; `redacted_thinking` becomes `thinkingVisibility: "redacted"` and emits a redacted start signal during streaming. +- ZAI fixture: streaming `delta.reasoning_content` becomes `thinkingVisibility: "private"` and is not marked displayable. +- OpenAI-compatible fallback fixture: unknown `reasoning` or `reasoning_text` fields become `private` or `unknown`, not displayable. +- Usage fixture: reasoning-token counts are included only when the upstream provider sent a dedicated field. +- Replay fixture: encrypted and redacted provider continuity payloads round-trip unchanged where required, but never become visible text. + +### Phantom Translator Tests + +- `thinkingVisibility: "summary"` maps to an explicit user-displayable summary frame. +- `private`, `redacted`, `encrypted`, and `unknown` thinking never emit `message.thinking_delta` containing raw private text. +- Existing `redacted_thinking` final assistant content still emits `message.thinking_start` with `redacted: true` and no delta. +- `api_retry` maps to a visible retry or waiting frame. 
+- `compact_completed` can update post-token count if Phantom chooses to show it. +- `session.done.usage.reasoning_tokens` appears only for provider-counted values. + +### Phantom UI Tests + +- Thinking card renders state, duration, and redacted/private labels without showing private text. +- Provider summary text renders only when the frame says it is displayable. +- Browser store and durable run timeline do not retain private thinking text. +- Long-running canned streams show transitions for thinking, tool running, partial output, rate limit, compacting, retrying, done, and error. +- Mobile and desktop snapshots show no layout jumps when labels change from active to completed. + +## Recommendation Checklist + +- Keep thinking text hidden by default: existing Phantom UI supports this today. +- Add thinking visibility provenance: contract gap in Pi/Murph normalized events and Phantom wire frames. +- Add server-side suppression for private thinking deltas: contract gap in Phantom translation once provenance exists. +- Preserve OpenAI reasoning token counts: contract gap in Pi usage, Murph usage, SDK result messages, Phantom frames, and timelines. +- Treat Anthropic redacted thinking as hidden: existing final content support exists, streaming redaction needs a Murph normalization fix. +- Treat ZAI `reasoning_content` as private: existing stream support exists, display safety requires visibility metadata and UI suppression. +- Surface retries and waiting: Murph has `api_retry`, but SDK and Phantom mapping are missing. +- Use existing tool progress, rate limit, compaction, and subagent events for best-in-class liveness now. 
diff --git a/src/chat/__tests__/http.test.ts b/src/chat/__tests__/http.test.ts index 62b72601..50ff4edf 100644 --- a/src/chat/__tests__/http.test.ts +++ b/src/chat/__tests__/http.test.ts @@ -177,6 +177,52 @@ describe("Chat HTTP handlers", () => { expect(body.run_timelines[0]?.summary.status).toBe("working"); }); + test("GET /chat/sessions/:id returns durable attachment transcript metadata", async () => { + const sessionStore = new ChatSessionStore(db); + const messageStore = new ChatMessageStore(db); + const attachmentStore = new ChatAttachmentStore(db); + const session = sessionStore.create(); + const attachmentId = attachmentStore.create({ + id: "att-1", + sessionId: session.id, + kind: "pdf", + filename: "brief.pdf", + mimeType: "application/pdf", + sizeBytes: 1234, + storagePath: "/tmp/brief.pdf", + }); + messageStore.commit({ + id: "user-1", + sessionId: session.id, + seq: 1, + role: "user", + contentJson: JSON.stringify([ + { + type: "attachment", + id: attachmentId, + filename: "brief.pdf", + mime_type: "application/pdf", + size_bytes: 1234, + preview_url: `/chat/attachments/${attachmentId}/preview`, + }, + { type: "text", text: "Review this." }, + ]), + }); + attachmentStore.commitToMessage(attachmentId, "user-1"); + + const res = await handler(makeAuthReq(`/chat/sessions/${session.id}`)); + expect(res?.status).toBe(200); + const body = (await res?.json()) as { messages: Array<{ content_json: string }> }; + const content = JSON.parse(body.messages[0]?.content_json ??
"[]") as Array<Record<string, unknown>>; + expect(content[0]).toMatchObject({ + type: "attachment", + id: "att-1", + filename: "brief.pdf", + mime_type: "application/pdf", + size_bytes: 1234, + }); + }); + test("GET /chat/sessions/:id returns 404 for missing session", async () => { const res = await handler(makeAuthReq("/chat/sessions/nonexistent")); expect(res?.status).toBe(404); @@ -334,4 +380,29 @@ describe("Chat HTTP handlers", () => { ); expect(res?.status).toBe(200); }); + + test("POST /chat/stream rejects invalid attachment ids", async () => { + const createRes = await handler( + makeAuthReq("/chat/sessions", { + method: "POST", + body: JSON.stringify({}), + }), + ); + const created = (await createRes?.json()) as { id: string }; + + const res = await handler( + makeAuthReq("/chat/stream", { + method: "POST", + body: JSON.stringify({ + session_id: created.id, + text: "Use the attached file.", + attachment_ids: ["missing-attachment"], + }), + }), + ); + + expect(res?.status).toBe(400); + const body = await res?.json(); + expect(body.error).toBe("attachment_not_found"); + }); }); diff --git a/src/chat/__tests__/message-builder.test.ts b/src/chat/__tests__/message-builder.test.ts index 1307d207..6c38f395 100644 --- a/src/chat/__tests__/message-builder.test.ts +++ b/src/chat/__tests__/message-builder.test.ts @@ -175,8 +175,9 @@ describe("buildUserMessageParam", () => { expect(content[3]?.text).toBe("analyze all"); }); - test("ignores non-existent attachment ids", async () => { - const msg = await buildUserMessageParam("hello", ["nonexistent-id"], attachmentStore); - expect(msg.content).toBe("hello"); + test("rejects non-existent attachment ids", async () => { + await expect(buildUserMessageParam("hello", ["nonexistent-id"], attachmentStore)).rejects.toThrow( + "Attachment is not available for this chat.", + ); }); }); diff --git a/src/chat/__tests__/sdk-to-wire.test.ts b/src/chat/__tests__/sdk-to-wire.test.ts index e646aeb3..afb89d75 100644 --- a/src/chat/__tests__/sdk-to-wire.test.ts
+++ b/src/chat/__tests__/sdk-to-wire.test.ts @@ -415,6 +415,31 @@ describe("sdk-to-wire translator", () => { expect(frames.some((f) => f.event === "session.error")).toBe(true); }); + test("result error after assistant start does not emit a normal assistant_end", () => { + const ctx = makeCtx(); + translateSdkMessage( + { + type: "assistant", + message: { content: [{ type: "text", text: "Partial answer" }] }, + parent_tool_use_id: null, + }, + ctx, + ); + const frames = translateSdkMessage( + { + type: "result", + subtype: "error_during_execution", + errors: ["Provider failed"], + total_cost_usd: 0.001, + usage: {}, + duration_ms: 500, + }, + ctx, + ); + expect(frames.some((f) => f.event === "session.error")).toBe(true); + expect(frames.some((f) => f.event === "message.assistant_end")).toBe(false); + }); + test("result with prompt_suggestion -> session.suggestion", () => { const ctx = makeCtx(); const frames = translateSdkMessage({ type: "prompt_suggestion", suggestion: "Tell me more" }, ctx); diff --git a/src/chat/__tests__/writer.test.ts b/src/chat/__tests__/writer.test.ts index 68beaaea..f1fc5aba 100644 --- a/src/chat/__tests__/writer.test.ts +++ b/src/chat/__tests__/writer.test.ts @@ -1,7 +1,9 @@ import { Database } from "bun:sqlite"; import { afterEach, beforeEach, describe, expect, test } from "bun:test"; import { MIGRATIONS } from "../../db/schema.ts"; +import { ChatAttachmentStore } from "../attachment-store.ts"; import { ChatEventLog } from "../event-log.ts"; +import { buildUserTranscriptContent } from "../message-builder.ts"; import { ChatMessageStore } from "../message-store.ts"; import { ChatRunTimelineStore } from "../run-timeline.ts"; import { ChatSessionStore } from "../session-store.ts"; @@ -12,6 +14,7 @@ import { ChatSessionWriter, getActiveWriter } from "../writer.ts"; let db: Database; let sessionStore: ChatSessionStore; let messageStore: ChatMessageStore; +let attachmentStore: ChatAttachmentStore; let eventLog: ChatEventLog; let timelineStore: 
ChatRunTimelineStore; let streamBus: StreamBus; @@ -23,6 +26,7 @@ beforeEach(() => { } sessionStore = new ChatSessionStore(db); messageStore = new ChatMessageStore(db); + attachmentStore = new ChatAttachmentStore(db); eventLog = new ChatEventLog(db); timelineStore = new ChatRunTimelineStore(db); streamBus = new StreamBus(); @@ -100,6 +104,65 @@ describe("ChatSessionWriter", () => { expect(eventTypes).toContain("session.done"); }); + test("commits attachments to the user message and emits metadata", async () => { + const session = sessionStore.create(); + const frames: ChatWireFrame[] = []; + streamBus.subscribe(session.id, (f) => frames.push(f)); + const attachmentId = attachmentStore.create({ + sessionId: session.id, + kind: "image", + filename: "diagram.png", + mimeType: "image/png", + sizeBytes: 12, + storagePath: "/tmp/diagram.png", + }); + const attachments = [ + { + id: attachmentId, + filename: "diagram.png", + mime_type: "image/png", + size_bytes: 12, + preview_url: `/chat/attachments/${attachmentId}/preview`, + }, + ]; + + const writer = new ChatSessionWriter({ + sessionId: session.id, + runtime: mockRuntime(), + eventLog, + messageStore, + attachmentStore, + sessionStore, + streamBus, + }); + writer.claim(); + const sdkMessage = { + role: "user" as const, + content: [ + { + type: "image", + source: { type: "base64", media_type: "image/png", data: "RAW_BASE64_PAYLOAD" }, + }, + { type: "text", text: "describe this" }, + ], + } as Parameters[0]; + + await writer.run(sdkMessage, "tab1", "describe this", { + attachments, + transcriptContent: buildUserTranscriptContent("describe this", attachments), + }); + + const userFrame = frames.find((frame) => frame.event === "user.message"); + expect(userFrame?.event).toBe("user.message"); + if (userFrame?.event !== "user.message") return; + expect(userFrame.attachments).toEqual(attachments); + + const userRow = messageStore.getById(userFrame.message_id); + 
expect(userRow?.content_json).toBe(JSON.stringify(buildUserTranscriptContent("describe this", attachments))); + expect(userRow?.content_json).not.toContain("RAW_BASE64_PAYLOAD"); + expect(attachmentStore.getById(attachmentId)?.message_id).toBe(userFrame.message_id); + }); + test("writer sets isActive during run", async () => { const session = sessionStore.create(); let wasActive = false; @@ -165,6 +228,51 @@ describe("ChatSessionWriter", () => { expect(eventTypes).toContain("session.error"); }); + test("non-success SDK result does not commit a successful assistant message", async () => { + const session = sessionStore.create(); + const frames: ChatWireFrame[] = []; + streamBus.subscribe(session.id, (f) => frames.push(f)); + + const writer = new ChatSessionWriter({ + sessionId: session.id, + runtime: mockRuntime({ + runForChat: async (_key, _message, opts) => { + opts.onSdkEvent({ + type: "result", + subtype: "error_during_execution", + errors: ["Provider failed"], + total_cost_usd: 0.02, + usage: { input_tokens: 10, output_tokens: 0 }, + duration_ms: 15, + num_turns: 1, + }); + return { + text: "", + sessionId: "sdk-1", + cost: { totalUsd: 0.02, inputTokens: 10, outputTokens: 0, modelUsage: {} }, + durationMs: 15, + }; + }, + }), + eventLog, + messageStore, + sessionStore, + timelineStore, + streamBus, + }); + writer.claim(); + + await writer.run({ role: "user", content: "fail" }, "t1", "fail"); + + const messages = messageStore.getBySession(session.id); + expect(messages.map((message) => message.role)).toEqual(["user"]); + expect(frames.some((frame) => frame.event === "session.error")).toBe(true); + expect(frames.some((frame) => frame.event === "session.done")).toBe(false); + const timelines = timelineStore.getDetailsBySession(session.id); + expect(timelines[0]?.status).toBe("error"); + expect(timelines[0]?.assistant_message_id).toBeNull(); + }); + test("multi-subscriber fan-out delivers to all", async () => { const session = sessionStore.create(); const frames1: 
ChatWireFrame[] = []; diff --git a/src/chat/http-handlers.ts b/src/chat/http-handlers.ts index 42a9adf5..073a14c5 100644 --- a/src/chat/http-handlers.ts +++ b/src/chat/http-handlers.ts @@ -1,11 +1,8 @@ -// Session-specific and streaming route handlers for the chat HTTP API. -// Split from http.ts to keep both files under 300 lines. - import type { SDKUserMessage } from "../agent/agent-sdk.ts"; type MessageParam = SDKUserMessage["message"]; import type { ChatHandlerDeps } from "./http.ts"; -import { buildUserMessageParam } from "./message-builder.ts"; +import { type BuiltUserMessage, ChatAttachmentResolutionError, buildUserMessage } from "./message-builder.ts"; import { CHAT_SSE_HEADERS, CHAT_SSE_RETRY_MS, @@ -84,28 +81,35 @@ export async function handleStream(req: Request, deps: ChatHandlerDeps): Promise } catch { return Response.json({ error: "Invalid JSON" }, { status: 400 }); } - if (!body.session_id || !body.text) { return Response.json({ error: "session_id and text are required" }, { status: 400 }); } - const existingWriter = getActiveWriter(body.session_id); if (existingWriter?.isActive) { return Response.json({ error: "Session busy" }, { status: 409 }); } - const session = deps.sessionStore.get(body.session_id); - if (!session) { - return Response.json({ error: "Session not found" }, { status: 404 }); - } - + if (!session) return Response.json({ error: "Session not found" }, { status: 404 }); const tabId = body.tab_id ?? "default"; const attachmentIds = body.attachment_ids ?? 
[]; - let message: MessageParam; + let message: MessageParam = { role: "user", content: body.text }; + let writerOptions: Pick | undefined; if (attachmentIds.length > 0) { - message = await buildUserMessageParam(body.text, attachmentIds, deps.attachmentStore); - } else { - message = { role: "user", content: body.text }; + try { + const builtMessage = await buildUserMessage(body.text, attachmentIds, body.session_id, deps.attachmentStore); + message = builtMessage.message; + writerOptions = { attachments: builtMessage.attachments, transcriptContent: builtMessage.transcriptContent }; + } catch (err: unknown) { + if (err instanceof ChatAttachmentResolutionError) { + return Response.json({ error: err.code, message: err.message }, { status: 400 }); + } + const messageText = err instanceof Error ? err.message : String(err); + console.error(`[chat-http] Attachment build failed for session ${body.session_id}: ${messageText}`); + return Response.json( + { error: "attachment_read_failed", message: "Could not read one or more attachments." }, + { status: 500 }, + ); + } } const writer = new ChatSessionWriter({ @@ -113,6 +117,7 @@ export async function handleStream(req: Request, deps: ChatHandlerDeps): Promise runtime: deps.runtime, eventLog: deps.eventLog, messageStore: deps.messageStore, + attachmentStore: deps.attachmentStore, sessionStore: deps.sessionStore, timelineStore: deps.timelineStore, streamBus: deps.streamBus, @@ -123,14 +128,12 @@ export async function handleStream(req: Request, deps: ChatHandlerDeps): Promise const sessionId = body.session_id; const stream = createSSEStream(sessionId, deps.streamBus, writer); - writer.run(message, tabId, body.text).catch((err: unknown) => { + writer.run(message, tabId, body.text, writerOptions).catch((err: unknown) => { const msg = err instanceof Error ? 
err.message : String(err); console.error(`[chat-http] Writer error for session ${sessionId}: ${msg}`); }); - return new Response(stream, { - headers: CHAT_SSE_HEADERS, - }); + return new Response(stream, { headers: CHAT_SSE_HEADERS }); } export async function handleResume(req: Request, sessionId: string, deps: ChatHandlerDeps): Promise<Response> { @@ -261,9 +264,7 @@ export async function handleResume(req: Request, sessionId: string, deps: ChatHa }, }); - return new Response(stream, { - headers: CHAT_SSE_HEADERS, - }); + return new Response(stream, { headers: CHAT_SSE_HEADERS }); } export function handleAbort(sessionId: string): Response { diff --git a/src/chat/message-builder.ts b/src/chat/message-builder.ts index cd01f946..50f40be8 100644 --- a/src/chat/message-builder.ts +++ b/src/chat/message-builder.ts @@ -21,23 +21,104 @@ type ContentBlock = { title?: string; }; +export type UserAttachmentMetadata = { + id: string; + filename: string; + mime_type: string; + size_bytes: number | null; + preview_url: string; +}; + +export type UserTranscriptContentBlock = + | (UserAttachmentMetadata & { type: "attachment" }) + | { type: "text"; text: string }; + +export type BuiltUserMessage = { + message: MessageParam; + attachments: UserAttachmentMetadata[]; + transcriptContent: string | UserTranscriptContentBlock[]; +}; + +export type AttachmentResolutionCode = "attachment_not_found" | "attachment_wrong_session" | "attachment_already_sent"; + +export class ChatAttachmentResolutionError extends Error { + readonly code: AttachmentResolutionCode; + + constructor(code: AttachmentResolutionCode) { + const message = + code === "attachment_already_sent" + ? "Attachment has already been sent." 
+ : "Attachment is not available for this chat."; + super(message); + this.name = "ChatAttachmentResolutionError"; + this.code = code; + } +} + +export async function buildUserMessage( + text: string, + attachmentIds: string[], + sessionId: string, + attachmentStore: ChatAttachmentStore, +): Promise<BuiltUserMessage> { + const attachments = resolveUserMessageAttachments(attachmentIds, attachmentStore, sessionId); + const metadata = attachments.map(attachmentToMetadata); + const message = await buildMessageParamFromAttachments(text, attachments); + return { + message, + attachments: metadata, + transcriptContent: buildUserTranscriptContent(text, metadata), + }; +} + export async function buildUserMessageParam( text: string, attachmentIds: string[], attachmentStore: ChatAttachmentStore, ): Promise<MessageParam> { - if (attachmentIds.length === 0) { - return { role: "user", content: text }; - } + const attachments = resolveUserMessageAttachments(attachmentIds, attachmentStore); + return buildMessageParamFromAttachments(text, attachments); +} + +export function buildUserTranscriptContent( + text: string, + attachments: UserAttachmentMetadata[], +): string | UserTranscriptContentBlock[] { + if (attachments.length === 0) return text; + return [...attachments.map((attachment) => ({ ...attachment, type: "attachment" as const })), { type: "text", text }]; +} - const attachments: ChatAttachment[] = []; +function resolveUserMessageAttachments( + attachmentIds: string[], + attachmentStore: ChatAttachmentStore, + sessionId?: string, +): ChatAttachment[] { + if (attachmentIds.length === 0) return []; - for (const id of attachmentIds) { + return attachmentIds.map((id) => { const att = attachmentStore.getById(id); - if (att) attachments.push(att); - } + if (!att) throw new ChatAttachmentResolutionError("attachment_not_found"); + if (sessionId && att.session_id !== sessionId) { + throw new ChatAttachmentResolutionError("attachment_wrong_session"); + } + if (att.message_id !== null) { + 
throw new 
ChatAttachmentResolutionError("attachment_already_sent"); + } + return att; + }); +} + +function attachmentToMetadata(att: ChatAttachment): UserAttachmentMetadata { + return { + id: att.id, + filename: att.filename ?? "file", + mime_type: att.mime_type ?? "application/octet-stream", + size_bytes: att.size_bytes, + preview_url: `/chat/attachments/${att.id}/preview`, + }; +} - // All IDs were invalid - fall back to plain text +async function buildMessageParamFromAttachments(text: string, attachments: ChatAttachment[]): Promise<MessageParam> { if (attachments.length === 0) { return { role: "user", content: text }; } diff --git a/src/chat/sdk-to-wire.ts b/src/chat/sdk-to-wire.ts index ab75f8b4..85b0512f 100644 --- a/src/chat/sdk-to-wire.ts +++ b/src/chat/sdk-to-wire.ts @@ -211,7 +211,7 @@ function handleResult(msg: Record<string, unknown>, ctx: TranslationContext): Ch const durationMs = (msg.duration_ms as number) ?? 0; const numTurns = (msg.num_turns as number) ?? 1; - if (ctx.assistantStartEmitted && !ctx.assistantEndEmitted) { + if (subtype === "success" && ctx.assistantStartEmitted && !ctx.assistantEndEmitted) { frames.push({ event: "message.assistant_end", message_id: ctx.messageId, diff --git a/src/chat/types.ts b/src/chat/types.ts index da57be1a..d20c59be 100644 --- a/src/chat/types.ts +++ b/src/chat/types.ts @@ -140,7 +140,13 @@ export type UserMessageFrame = { event: "user.message"; message_id: string; text: string; - attachments: Array<{ id: string; filename: string; mime_type: string }>; + attachments: Array<{ + id: string; + filename: string; + mime_type: string; + size_bytes: number | null; + preview_url: string; + }>; sent_at: string; source_tab_id: string; }; diff --git a/src/chat/upload.ts b/src/chat/upload.ts index 2e2293cb..ff63ea59 100644 --- a/src/chat/upload.ts +++ b/src/chat/upload.ts @@ -20,6 +20,7 @@ export type UploadDeps = { export type AcceptedAttachment = { id: string; + client_id?: string; filename: string; mime_type: string; size: number; 
@@ -27,6 +28,7 @@ export type 
AcceptedAttachment = { }; export type RejectedAttachment = { + client_id?: string; filename: string; reason: string; message: string; @@ -61,17 +63,23 @@ export async function handleUploadAttachments(req: Request, sessionId: string, d } const files = formData.getAll("file").filter((v): v is File => v instanceof File); + const clientIds = formData + .getAll("client_id") + .map((value) => (typeof value === "string" && value.length > 0 ? value : null)); if (files.length === 0) { return Response.json({ error: "no_files", message: "No files attached." }, { status: 400 }); } + const uploadItems = files.map((file, index) => ({ file, clientId: clientIds[index] ?? null })); + if (files.length > MAX_FILES_PER_REQUEST) { // Take the first MAX_FILES_PER_REQUEST, reject the rest - const toProcess = files.slice(0, MAX_FILES_PER_REQUEST); - const overflow = files.slice(MAX_FILES_PER_REQUEST); - const overflowRejected = overflow.map((f) => ({ - filename: f.name, + const toProcess = uploadItems.slice(0, MAX_FILES_PER_REQUEST); + const overflow = uploadItems.slice(MAX_FILES_PER_REQUEST); + const overflowRejected = overflow.map((item) => ({ + ...(item.clientId ? { client_id: item.clientId } : {}), + filename: item.file.name, reason: "limit_exceeded", message: `Limit of ${MAX_FILES_PER_REQUEST} files per upload reached.`, })); @@ -81,25 +89,36 @@ export async function handleUploadAttachments(req: Request, sessionId: string, d return Response.json({ attachments: result.attachments, rejected: result.rejected }, { status }); } - const result = await processFiles(files, sessionId, deps); + const result = await processFiles(uploadItems, sessionId, deps); const status = result.rejected.length === 0 ? 200 : result.attachments.length === 0 ? 
400 : 207; return Response.json({ attachments: result.attachments, rejected: result.rejected }, { status }); } +type UploadItem = { + file: File; + clientId: string | null; +}; + async function processFiles( - files: File[], + files: UploadItem[], sessionId: string, deps: UploadDeps, ): Promise<{ attachments: AcceptedAttachment[]; rejected: RejectedAttachment[] }> { const accepted: AcceptedAttachment[] = []; const rejected: RejectedAttachment[] = []; - for (const file of files) { + for (const item of files) { + const { file } = item; const mime = file.type || guessMimeFromName(file.name) || ""; const validation = validateFile(mime, file.size, file.name); if (!validation.ok) { - rejected.push({ filename: file.name, reason: validation.reason, message: validation.message }); + rejected.push({ + ...(item.clientId ? { client_id: item.clientId } : {}), + filename: file.name, + reason: validation.reason, + message: validation.message, + }); console.log(`[chat-upload] sessionId=${sessionId} file=${file.name} reason=${validation.reason}`); continue; } @@ -125,6 +144,7 @@ async function processFiles( accepted.push({ id, + ...(item.clientId ? { client_id: item.clientId } : {}), filename: sanitizeFilename(file.name), mime_type: mime, size: file.size, @@ -135,7 +155,12 @@ async function processFiles( } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); console.error(`[chat-upload] write failed for ${file.name}: ${msg}`); - rejected.push({ filename: file.name, reason: "storage_failed", message: "Could not save file. Please retry." }); + rejected.push({ + ...(item.clientId ? { client_id: item.clientId } : {}), + filename: file.name, + reason: "storage_failed", + message: "Could not save file. 
Please retry.", + }); } } diff --git a/src/chat/writer.ts b/src/chat/writer.ts index e87906a9..806331ef 100644 --- a/src/chat/writer.ts +++ b/src/chat/writer.ts @@ -2,9 +2,11 @@ import type { SDKUserMessage } from "../agent/agent-sdk.ts"; type MessageParam = SDKUserMessage["message"]; import type { AgentRuntime } from "../agent/runtime.ts"; +import type { ChatAttachmentStore } from "./attachment-store.ts"; import { autoRenameSession } from "./auto-rename.ts"; import { buildChatContinuityContext } from "./continuity-context.ts"; import type { ChatEventLog } from "./event-log.ts"; +import type { UserAttachmentMetadata, UserTranscriptContentBlock } from "./message-builder.ts"; import type { ChatMessageStore } from "./message-store.ts"; import type { NotificationTriggerService } from "./notifications/triggers.ts"; import type { ChatRunTimelineStore } from "./run-timeline.ts"; @@ -19,12 +21,18 @@ export type ChatSessionWriterDeps = { runtime: AgentRuntime; eventLog: ChatEventLog; messageStore: ChatMessageStore; + attachmentStore?: ChatAttachmentStore; sessionStore: ChatSessionStore; timelineStore?: ChatRunTimelineStore; streamBus: StreamBus; notificationTriggers?: NotificationTriggerService; }; +export type ChatSessionWriterRunOptions = { + attachments?: UserAttachmentMetadata[]; + transcriptContent?: string | UserTranscriptContentBlock[]; +}; + // Active writers keyed by sessionId for abort and busy-check lookups const activeWriters = new Map<string, ChatSessionWriter>(); @@ -54,22 +62,35 @@ export class ChatSessionWriter { activeWriters.set(this.deps.sessionId, this); } - async run(message: MessageParam, tabId: string, userText: string): Promise<void> { + async run( + message: MessageParam, + tabId: string, + userText: string, + options?: ChatSessionWriterRunOptions, + ): Promise<void> { if (!this.running) { throw new Error("Writer must be claimed before run()"); } this.abortController = new AbortController(); + const attachments = options?.attachments ?? 
[]; + if (attachments.length > 0 && !this.deps.attachmentStore) { + throw new Error("Attachment store is required to commit chat attachments"); + } const seqCounter = { current: this.deps.eventLog.getMaxSeq(this.deps.sessionId) }; const msgSeq = this.deps.messageStore.getMaxSeq(this.deps.sessionId) + 1; + const transcriptContent = options?.transcriptContent ?? userText; const userMessageId = this.deps.messageStore.commit({ sessionId: this.deps.sessionId, seq: msgSeq, role: "user", - contentJson: JSON.stringify(typeof message === "string" ? message : message.content), + contentJson: JSON.stringify(transcriptContent), }); + for (const attachment of attachments) { + this.deps.attachmentStore?.commitToMessage(attachment.id, userMessageId); + } this.deps.sessionStore.incrementMessageCount(this.deps.sessionId); this.deps.sessionStore.setFirstUserMessageAt(this.deps.sessionId); @@ -77,7 +98,7 @@ export class ChatSessionWriter { event: "user.message", message_id: userMessageId, text: userText, - attachments: [], + attachments, sent_at: new Date().toISOString(), source_tab_id: tabId, }; @@ -98,6 +119,7 @@ export class ChatSessionWriter { const sessionKey = `web:${this.deps.sessionId}`; const startTime = Date.now(); let resultText = ""; + let terminalErrorMessage: string | null = null; try { const sessionContext = buildChatContinuityContext({ @@ -110,6 +132,9 @@ export class ChatSessionWriter { onSdkEvent: (sdkMsg: unknown) => { const frames = translateSdkMessage(sdkMsg as Record<string, unknown>, ctx); for (const frame of frames) { + if (frame.event === "session.error") { + terminalErrorMessage = frame.errors[0] ?? 
"Run failed."; + } const seq = this.emitFrame(frame, seqCounter); if (timeline.apply(frame, seq)) { this.persistTimeline(timeline); @@ -120,6 +145,19 @@ export class ChatSessionWriter { resultText = response.text; + if (terminalErrorMessage) { + this.deps.sessionStore.updateCost(this.deps.sessionId, response.cost); + if (this.deps.notificationTriggers) { + this.deps.notificationTriggers + .onHardError(this.deps.sessionId, terminalErrorMessage) + .catch((triggerErr: unknown) => { + const msg = triggerErr instanceof Error ? triggerErr.message : String(triggerErr); + console.warn(`[push] trigger failed: ${msg}`); + }); + } + return; + } + this.deps.messageStore.commit({ id: assistantMessageId, sessionId: this.deps.sessionId, From 270589ee147c85c57c15850bba11e131298e1d9b Mon Sep 17 00:00:00 2001 From: Truffle Date: Fri, 1 May 2026 00:00:49 -0700 Subject: [PATCH 3/9] Add bounded transcript recovery for chat compaction --- prompts/phase-10i-chat-detail-research.md | 78 +++ .../phase-10i-memory-architecture-research.md | 76 +++ .../phase-10i-transcript-recovery-builder.md | 92 +++ ...10i-transcript-recovery-final-re-review.md | 47 ++ ...phase-10i-transcript-recovery-re-review.md | 79 +++ .../phase-10i-transcript-recovery-review.md | 53 ++ .../phase-10i-transcript-search-research.md | 69 +++ .../phase-10i-chat-detail-research.md | 502 ++++++++++++++++ .../phase-10i-memory-architecture-research.md | 251 ++++++++ ...10i-transcript-recovery-final-re-review.md | 36 ++ ...-10i-transcript-recovery-implementation.md | 90 +++ ...phase-10i-transcript-recovery-re-review.md | 84 +++ .../phase-10i-transcript-recovery-review.md | 69 +++ .../phase-10i-transcript-search-research.md | 553 ++++++++++++++++++ .../reflective-transcript-tools.test.ts | 106 ++++ src/agent/chat-query.ts | 21 +- src/agent/in-process-reflective-tools.ts | 69 ++- src/agent/mcp-server-factory.ts | 12 + src/agent/runtime.ts | 12 +- src/chat/__tests__/continuity-context.test.ts | 12 + 
src/chat/__tests__/transcript-search.test.ts | 230 ++++++++ src/chat/continuity-context.ts | 12 +- src/chat/redaction.ts | 31 + src/chat/run-timeline.ts | 27 +- src/chat/transcript-search.ts | 250 ++++++++ src/index.ts | 5 +- 26 files changed, 2815 insertions(+), 51 deletions(-) create mode 100644 prompts/phase-10i-chat-detail-research.md create mode 100644 prompts/phase-10i-memory-architecture-research.md create mode 100644 prompts/phase-10i-transcript-recovery-builder.md create mode 100644 prompts/phase-10i-transcript-recovery-final-re-review.md create mode 100644 prompts/phase-10i-transcript-recovery-re-review.md create mode 100644 prompts/phase-10i-transcript-recovery-review.md create mode 100644 prompts/phase-10i-transcript-search-research.md create mode 100644 research/chat-experience/phase-10i-chat-detail-research.md create mode 100644 research/chat-experience/phase-10i-memory-architecture-research.md create mode 100644 research/chat-experience/phase-10i-transcript-recovery-final-re-review.md create mode 100644 research/chat-experience/phase-10i-transcript-recovery-implementation.md create mode 100644 research/chat-experience/phase-10i-transcript-recovery-re-review.md create mode 100644 research/chat-experience/phase-10i-transcript-recovery-review.md create mode 100644 research/chat-experience/phase-10i-transcript-search-research.md create mode 100644 src/agent/__tests__/reflective-transcript-tools.test.ts create mode 100644 src/agent/mcp-server-factory.ts create mode 100644 src/chat/__tests__/transcript-search.test.ts create mode 100644 src/chat/redaction.ts create mode 100644 src/chat/transcript-search.ts diff --git a/prompts/phase-10i-chat-detail-research.md b/prompts/phase-10i-chat-detail-research.md new file mode 100644 index 00000000..b2872e13 --- /dev/null +++ b/prompts/phase-10i-chat-detail-research.md @@ -0,0 +1,78 @@ +ultrathink. ultrathink. ultrathink. 
+ +You are a principal engineer, principal architect, and principal product manager at Anthropic, all three at once. You are researching the next best-in-class chat detail slice for Phantom. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No "v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Mission + +Audit Phantom chat with obsessive product detail and propose the next implementation slices for: + +1. Long-running agent activity that never feels dead. +2. Tool cards that are collapsed by default but useful at a glance. +3. First-class files, pages, and artifacts. +4. Minor visual details: borders, spacing, icons, copy/open affordances, markdown, empty/loading/error states. +5. User-visible memory and agent-visible memory surfaces. + +Write your report to: + +`/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-chat-detail-research.md` + +Do not edit application code. + +## Required Reading + +Read these files directly: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +4. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-live-verification.md` +6. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-phantom-chat-review.md` +7. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/message-list.tsx` +8. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/run-activity-row.tsx` +9. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx` +10. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx` +11. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/user-message.tsx` +12. 
`/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/chat-input.tsx` +13. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/markdown.tsx` +14. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-activity.ts` +15. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts` +16. `/Users/truffle/work/phantom-murph-hardening/src/chat/run-timeline.ts` +17. `/Users/truffle/work/phantom-murph-hardening/src/ui/tools.ts` +18. `/Users/truffle/work/phantom-murph-hardening/src/ui/preview.ts` + +Also inspect Pi web UI patterns in `/Users/truffle/work/pi-mono/packages/web-ui/src` where relevant. + +## Questions To Answer + +1. What should be fixed next for the user to feel the agent is alive during long work? +2. What tool-card collapsed information is missing today? +3. How should generated pages/files be rendered in chat without requiring raw JSON reading? +4. Which icons should be used from `lucide-react` for memory, transcript search, artifacts, files, pages, tool states, and progress? +5. What minor border/spacing/layout issues are visible from the current code and screenshots? +6. What should be built in the next one or two PRs, and what should wait? +7. How should we visually verify each slice with Playwright screenshots? + +## Design Constraints + +- Use existing React/Tailwind/lucide patterns. +- Do not build a landing page or marketing layer. +- Avoid one-note color changes. Keep visual changes restrained and practical. +- Do not use emojis as primary UI primitives. +- Do not expose hidden reasoning or sensitive tool output. +- Prefer feature-complete small slices over broad partial redesigns. + +## Deliverable Format + +Write: + +1. Executive recommendation. +2. Current UI findings by component. +3. Concrete next slices, in order. +4. Visual acceptance criteria. +5. Playwright verification plan. +6. Risks and edge cases. + +ultrathink. 
The target is a chat surface people choose because it feels alive, legible, and trustworthy. diff --git a/prompts/phase-10i-memory-architecture-research.md b/prompts/phase-10i-memory-architecture-research.md new file mode 100644 index 00000000..060e7f0d --- /dev/null +++ b/prompts/phase-10i-memory-architecture-research.md @@ -0,0 +1,76 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product manager at Anthropic, all three at once. You are researching Phase 10I for Phantom on Murph in `/Users/truffle/work/phantom-murph-hardening`. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No "v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Mission + +Research the correct production architecture for dual memory in Phantom chat: + +1. A user-visible memory surface: things the operator can see, understand, edit, and trust. +2. An agent-visible operational memory surface: things the agent needs to continue work after compaction, remember task state, and avoid asking the user to repeat context. + +The immediate product direction is that post-compaction work should continue naturally. The agent should have enough historical transcript access to recover details that were compacted away, but this must not turn into a giant prompt dump or unsafe leakage of hidden tool internals. + +Write your report to: + +`/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-memory-architecture-research.md` + +Do not edit application code. + +## Required Reading + +Read these files directly: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/murph/QUALITY-BAR.md` +4. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +6. 
`/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-live-verification.md` +7. `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-assembler.ts` +8. `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-blocks/agent-memory-instructions.ts` +9. `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-blocks/working-memory.ts` +10. `/Users/truffle/work/phantom-murph-hardening/src/agent/chat-query.ts` +11. `/Users/truffle/work/phantom-murph-hardening/src/agent/murph-context.ts` +12. `/Users/truffle/work/phantom-murph-hardening/src/agent/in-process-reflective-tools.ts` +13. `/Users/truffle/work/phantom-murph-hardening/src/memory/context-builder.ts` +14. `/Users/truffle/work/phantom-murph-hardening/src/memory/system.ts` +15. `/Users/truffle/work/phantom-murph-hardening/src/ui/api/memory-files.ts` + +Also inspect any tests that define the intended behavior. + +## Questions To Answer + +1. What memory surfaces already exist, by exact file path and behavior? +2. Which surface is currently user-visible, which is agent-visible, and which is both? +3. What does the agent already receive in the prompt before a run? +4. What does the agent already get through tools such as `phantom_memory_search` and `phantom_history`? +5. What should "for-agent memory" mean in product terms, and where should it live? +6. What should "for-user memory" mean in product terms, and where should it render? +7. What should never be shown to users, but may be used by the agent for continuity? +8. What is the smallest safe next builder slice that improves post-compaction continuity without over-building? + +## Design Constraints + +- Keep Murph generic. Phantom-specific memory UI and session history belong in Phantom. +- Do not expose raw chain-of-thought. +- Do not dump entire transcripts into prompts. +- Do not store raw base64, credentials, or unredacted tool full outputs in user memory. +- Prefer existing storage and APIs before adding new systems. 
+- A future cloud deployment must be durable across process restarts. + +## Deliverable Format + +Write: + +1. Executive recommendation. +2. Current state inventory. +3. Proposed memory taxonomy. +4. Proposed agent prompt/tool contract. +5. Proposed user-visible UI/API contract. +6. Risks and privacy boundaries. +7. Smallest next builder slice with exact files likely touched and tests required. + +ultrathink. Be specific. Cite exact files and functions. Do not rely on summaries. diff --git a/prompts/phase-10i-transcript-recovery-builder.md b/prompts/phase-10i-transcript-recovery-builder.md new file mode 100644 index 00000000..6d488d92 --- /dev/null +++ b/prompts/phase-10i-transcript-recovery-builder.md @@ -0,0 +1,92 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product manager at Anthropic, all three at once. You are building Phase 10I for Phantom on Murph in `/Users/truffle/work/phantom-murph-hardening`. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No "v2" thinking. There is no v2. Build it right. + +## Mission + +Add the smallest production-ready transcript recovery slice so a Phantom chat agent can continue naturally after Murph compaction. + +The agent needs access to a compact, cited, redacted view of the durable pre-compaction transcript. It must not dump the whole transcript into the provider context, and it must not ask the user to repeat context when Phantom can safely recover it from the chat database. + +## Required Reading + +Read these files directly before implementation: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/murph/QUALITY-BAR.md` +4. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +5. `/Users/truffle/work/phantom-murph-hardening/src/agent/in-process-reflective-tools.ts` +6. `/Users/truffle/work/phantom-murph-hardening/src/chat/continuity-context.ts` +7. 
`/Users/truffle/work/phantom-murph-hardening/src/chat/message-store.ts` +8. `/Users/truffle/work/phantom-murph-hardening/src/agent/murph-context.ts` +9. `/Users/truffle/work/phantom-murph-hardening/src/agent/chat-query.ts` +10. `/Users/truffle/work/murph/packages/core/src/compaction/compact.ts` +11. `/Users/truffle/work/murph/packages/core/src/compaction/summary-prompt.ts` +12. `/Users/truffle/work/pi-mono/packages/coding-agent/docs/compaction.md` + +## Implementation Scope + +Build a current-session transcript search capability: + +1. Add a helper that reads durable `chat_messages` rows for a session. +2. Return compact search results with `seq`, `role`, `created_at`, and redacted snippets. +3. Support a query, role filter, seq window, and result limit. +4. Redact secrets, API keys, authorization fields, cookies, and magic-login tokens. +5. Preserve page URLs and normal opaque IDs, because those are often exactly what the agent needs to recover. +6. Register the helper as an in-process MCP tool on `phantom-reflective`. +7. Update chat continuity context so every Murph chat run includes the current chat session id and instructs the agent to use transcript search when earlier context is missing after compaction. +8. Keep tool output compact and safe for model context. + +## Non-Goals + +- Do not create a vector search system. +- Do not expose raw provider reasoning. +- Do not expose attachment base64. +- Do not create a user-facing memory UI in this slice. +- Do not alter Murph compaction internals unless the tests prove Phantom is using them incorrectly. +- Do not change broad chat UI visuals in this slice. + +## Acceptance Criteria + +1. A post-compaction agent can discover the current chat session id from `# Current Chat Context`. +2. The agent has an in-process tool named `phantom_chat_transcript_search`. +3. The tool can find an earlier user or assistant detail by text query in the current session. +4. 
The tool can list recent transcript entries when no query is given. +5. Tool results are compact, cited by seq, and redacted. +6. Login magic tokens are not returned. +7. Existing memory search and session listing tools continue to exist. +8. Unit tests cover query search, recent lookup, redaction, role filters, and continuity context injection. +9. Full local checks pass before commit. + +## Verification + +Run focused tests first: + +- `bun test src/chat/__tests__/transcript-search.test.ts src/chat/__tests__/continuity-context.test.ts` +- `bun test src/agent/__tests__/reflective-transcript-tools.test.ts` + +Then run the broader gate required by the quality bar: + +- `bun test` +- `bun run lint` +- `bun run typecheck` +- `cd chat-ui && bun test` +- `cd chat-ui && bun run typecheck` +- `cd chat-ui && bun run build` + +After local checks, run a live Phantom-on-Murph browser smoke with a long session if time permits. + +## Anti-Patterns + +- Do not stringify whole messages without truncation. +- Do not let query text become raw SQL. +- Do not use explicit `any`. +- Do not use `@ts-ignore`. +- Do not ask the user to repeat information that exists in the transcript. +- Do not call a login link a created page URL. +- Do not hide tool failures as successful empty assistant responses. + +ultrathink. Build the runtime affordance that makes compaction feel trustworthy instead of forgetful. diff --git a/prompts/phase-10i-transcript-recovery-final-re-review.md b/prompts/phase-10i-transcript-recovery-final-re-review.md new file mode 100644 index 00000000..69031107 --- /dev/null +++ b/prompts/phase-10i-transcript-recovery-final-re-review.md @@ -0,0 +1,47 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. You are doing a final narrow re-review +of Phase 10I transcript recovery after the previous re-review found one +remaining P2: attachment metadata bypassed redaction. 
+ +No context anxiety. No token limits. No time pressure. No cost anxiety. No +"v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Mission + +Verify whether the attachment metadata redaction P2 is now resolved and whether +the narrow fix introduced any new P0, P1, or P2 issue. + +## Required Reading + +Read from disk: + +1. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-recovery-re-review.md` +2. `src/chat/transcript-search.ts` +3. `src/chat/redaction.ts` +4. `src/chat/__tests__/transcript-search.test.ts` +5. `src/agent/__tests__/reflective-transcript-tools.test.ts` + +Also inspect the current diff for related files. + +## Required Verification + +Run: + +```bash +bun test src/chat/__tests__/transcript-search.test.ts src/chat/__tests__/continuity-context.test.ts src/agent/__tests__/reflective-transcript-tools.test.ts src/chat/__tests__/run-timeline.test.ts src/agent/__tests__/agent-sdk-boundary-callers.test.ts src/agent/__tests__/murph-context.test.ts +bun run lint +bun run typecheck +``` + +## Output Contract + +Write your final report to: + +`/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-recovery-final-re-review.md` + +Lead with findings. If there are no P0, P1, or P2 findings, say so explicitly. +Do not edit production code. Do not commit. + +ultrathink before finalizing. Actual files and command output are evidence. diff --git a/prompts/phase-10i-transcript-recovery-re-review.md b/prompts/phase-10i-transcript-recovery-re-review.md new file mode 100644 index 00000000..ef0b9739 --- /dev/null +++ b/prompts/phase-10i-transcript-recovery-re-review.md @@ -0,0 +1,79 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. 
You are independently re-reviewing the +Phase 10I Phantom-on-Murph transcript recovery fixes after a prior reviewer +found two P2 issues. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No +"v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Mission + +Review the latest worktree and decide whether the previous P2 findings are +resolved without introducing new P0, P1, or P2 regressions. + +## Required Reading + +Read these files from disk, not summaries: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/murph/QUALITY-BAR.md` +4. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-recovery-review.md` +6. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-recovery-implementation.md` + +Then inspect the changed source and tests: + +1. `src/agent/mcp-server-factory.ts` +2. `src/agent/runtime.ts` +3. `src/agent/chat-query.ts` +4. `src/agent/in-process-reflective-tools.ts` +5. `src/chat/continuity-context.ts` +6. `src/chat/transcript-search.ts` +7. `src/chat/redaction.ts` +8. `src/chat/run-timeline.ts` +9. `src/chat/__tests__/transcript-search.test.ts` +10. `src/agent/__tests__/reflective-transcript-tools.test.ts` +11. `src/chat/__tests__/continuity-context.test.ts` + +## Questions To Answer + +1. Is the soft-delete lifecycle fixed against the real + `ChatSessionStore.softDelete` path? +2. Is transcript search restricted to the current bound Phantom web chat + session? +3. Does the new MCP factory context preserve existing non-chat MCP factories + and main runtime paths? +4. Is transcript output sufficiently redacted for credential-shaped values, + private keys, AWS keys, magic links, cookies, and base64 payloads? +5. Does structured extraction avoid dumping unknown raw objects or attachment + payloads? +6. 
Did the shared redaction helper preserve durable run timeline behavior? +7. Are the tests adequate for this slice? + +## Required Verification + +Run at minimum: + +```bash +bun test src/chat/__tests__/transcript-search.test.ts src/chat/__tests__/continuity-context.test.ts src/agent/__tests__/reflective-transcript-tools.test.ts src/chat/__tests__/run-timeline.test.ts src/agent/__tests__/agent-sdk-boundary-callers.test.ts src/agent/__tests__/murph-context.test.ts +bun run lint +bun run typecheck +``` + +You may run additional commands if needed. + +## Output Contract + +Write the report to: + +`/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-recovery-re-review.md` + +Lead with findings, ordered by severity. If there are no P0, P1, or P2 +findings, say that explicitly. Include commands run and their results. Do not +edit production code. Do not commit. + +ultrathink before finalizing. Treat agent summaries as non-evidence; read the +actual files and command output. diff --git a/prompts/phase-10i-transcript-recovery-review.md b/prompts/phase-10i-transcript-recovery-review.md new file mode 100644 index 00000000..672ecd16 --- /dev/null +++ b/prompts/phase-10i-transcript-recovery-review.md @@ -0,0 +1,53 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, principal product manager, and security reviewer at Anthropic, all at once. Review the Phase 10I transcript recovery slice in `/Users/truffle/work/phantom-murph-hardening`. + +No context anxiety. No time pressure. Be exact. Read actual files and tests. Do not rely on summaries. + +## Scope + +Review these changed production files: + +1. `src/chat/transcript-search.ts` +2. `src/chat/continuity-context.ts` +3. `src/agent/in-process-reflective-tools.ts` + +Review these tests: + +1. `src/chat/__tests__/transcript-search.test.ts` +2. `src/chat/__tests__/continuity-context.test.ts` +3. 
`src/agent/__tests__/reflective-transcript-tools.test.ts` + +Also read relevant current research: + +1. `research/chat-experience/phase-10i-memory-architecture-research.md` +2. `research/chat-experience/phase-10i-transcript-search-research.md` + +## Questions + +1. Does this implement a production-safe transcript recovery path after Murph compaction? +2. Is the current session id injected without bloating provider context? +3. Is the agent tool bounded, cited, and redacted enough? +4. Could it leak magic links, API keys, raw base64, attachment payloads, or hidden runtime records? +5. Are query, role, seq window, and limit behavior correct? +6. Does the implementation misuse Murph or Pi instead of building Phantom-specific glue? +7. Are there P0, P1, or P2 issues that must be fixed before commit? +8. Are there lower-priority follow-ups that should be documented but not block this slice? + +## Required Verification + +Run or inspect results for: + +- `bun test src/chat/__tests__/transcript-search.test.ts src/chat/__tests__/continuity-context.test.ts src/agent/__tests__/reflective-transcript-tools.test.ts` +- `bun run lint` +- `bun run typecheck` + +## Output + +Write a concise review report to: + +`/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-recovery-review.md` + +Use severity labels P0, P1, P2, P3. If there are no P0/P1/P2 findings, say that clearly. Do not edit production code. Do not commit. + +ultrathink. This is a trust boundary. Be skeptical. diff --git a/prompts/phase-10i-transcript-search-research.md b/prompts/phase-10i-transcript-search-research.md new file mode 100644 index 00000000..37b25f1e --- /dev/null +++ b/prompts/phase-10i-transcript-search-research.md @@ -0,0 +1,69 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product manager at Anthropic, all three at once. 
You are researching historical transcript search for Phantom chat after Murph compaction. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No "v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Mission + +Design the smallest production-ready mechanism that lets the agent access historical chat transcript details that may have been compacted out of the provider context. + +Write your report to: + +`/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-search-research.md` + +Do not edit application code. + +## Required Reading + +Read these files directly: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +4. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10g-pi-continuity.md` +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-phantom-chat-review.md` +6. `/Users/truffle/work/phantom-murph-hardening/src/chat/message-store.ts` +7. `/Users/truffle/work/phantom-murph-hardening/src/chat/event-log.ts` +8. `/Users/truffle/work/phantom-murph-hardening/src/chat/continuity-context.ts` +9. `/Users/truffle/work/phantom-murph-hardening/src/chat/run-timeline.ts` +10. `/Users/truffle/work/phantom-murph-hardening/src/chat/http.ts` +11. `/Users/truffle/work/phantom-murph-hardening/src/chat/http-handlers.ts` +12. `/Users/truffle/work/phantom-murph-hardening/src/db/schema.ts` +13. `/Users/truffle/work/phantom-murph-hardening/src/agent/in-process-reflective-tools.ts` +14. `/Users/truffle/work/murph/packages/core/src/substrate/pi-harness.ts` +15. `/Users/truffle/work/murph/packages/core/src/query/query.ts` + +Also inspect tests under `src/chat/__tests__` and `src/agent/__tests__`. + +## Questions To Answer + +1. What durable data has enough fidelity to reconstruct pre-compaction history? +2. 
Is SQLite search enough for the first slice, or should we use Qdrant memory? +3. Should transcript lookup be an agent tool, an injected continuity summary, a user-visible API, or all three? +4. How should the tool avoid exposing secrets, login links, raw attachment payloads, and large outputs? +5. What query/filter shape is enough: current session only, all sessions, date windows, roles, sequence ranges, artifact-only? +6. How should the agent know the tool exists and when to use it? +7. How do we test that post-compaction follow-up can recover a detail from earlier transcript? +8. What is the smallest implementation slice that is valuable and low-risk? + +## Design Constraints + +- Compaction should touch provider context only. It must not close tools, MCP clients, or transcript search capability. +- The search mechanism must work after reload and process restart. +- Search results should be compact, cited by session id and seq, and safe to paste into the model context. +- Do not create a full vector-search system if SQLite plus existing memory is enough for this slice. +- Do not force the user to manually search if the agent can use a tool. + +## Deliverable Format + +Write: + +1. Executive recommendation. +2. Current persistence model. +3. Proposed transcript search contract. +4. Safety/redaction model. +5. Test plan. +6. Builder slice proposal with exact files likely touched. + +ultrathink. This is a product trust feature. Get the boundary right. diff --git a/research/chat-experience/phase-10i-chat-detail-research.md b/research/chat-experience/phase-10i-chat-detail-research.md new file mode 100644 index 00000000..fb3c0f7d --- /dev/null +++ b/research/chat-experience/phase-10i-chat-detail-research.md @@ -0,0 +1,502 @@ +# Phase 10I Chat Detail Research + +Date: 2026-05-01 + +Scope: research only. No application code changes. + +## Executive Recommendation + +The next best slice is not a broad redesign. 
It is a trust slice: make the +current run feel attached to the current user request, make collapsed tool cards +carry real product meaning, and turn Phantom pages/files into durable chat +objects instead of raw tool logs. + +Recommended PR order: + +1. **PR 1: Live run presence plus structured collapsed tool summaries.** Keep + the existing calm visual language, but move the active run surface from a + detached row after the transcript into the current turn. Add normalized tool + identity and safe collapsed facts to the durable timeline and client state. + The immediate win is that a long Bash, preview, compaction, reconnect, or + subagent run never looks idle. +2. **PR 2: First-class page/file artifact cards plus markdown polish.** Use the + page metadata already returned by `phantom_create_page` and + `phantom_preview_page` to render inline artifacts with open/copy/preview + affordances. Add explicit markdown table/list/code/link handling so generated + reports and URLs look intentional. + +Wait on a full artifact drawer, full memory explorer inside chat, transcript + search, arbitrary workspace file preview, and provider reasoning summaries + until the smaller contracts below are stable. Those features are right, but + they should not block the live trust slice. + +Important source updates since Phase 10H: + +- Assistant text blocks are no longer dropped. `AssistantMessage` now renders + every assistant text block through `getAssistantTextBlocks`. +- User attachments now stream, persist, reload, and render as chips in the user + message. +- Upload failures now block send instead of silently sending without intended + files. +- SDK terminal errors now avoid committing a successful empty assistant row. + +Those fixes mean Phase 10I should focus on product depth, not re-solving the +Phase 10H integrity bugs. 
+ +## Current UI Findings By Component + +### MessageList + +`chat-ui/src/components/message-list.tsx` renders saved run timelines under the +message that owns them, but the live `runActivity` still renders after all +messages as a standalone row. This is visible in the Phase 10H screenshots: the +completed activity reads as a separate object below the answer rather than as +the state of the turn. + +Fix direction: + +- While there is no assistant content yet, visually attach the active run strip + to the latest user message. +- Once `message.assistant_start` or text starts, make the strip the header of + the assistant run. +- Preserve the existing live elapsed timer and facts, but improve labels from + `Using phantom_create_page...` to product labels like `Creating page`, + `Previewing page`, `Running Bash`, `Reading file`, `Compacting context`, and + `Reconnected`. + +### RunActivityRow + +`run-activity-row.tsx` has the right primitives: status icons, elapsed time, +compaction facts, rate-limit facts, MCP server readiness, subagent facts, and +tool cards. The weakness is hierarchy and label quality. + +Current issues: + +- The row is visually similar to another transcript message, not the current + run state. +- `activityFacts` can show useful counts, but completed rows still need a more + scan-friendly summary such as `4 tools`, `1 page`, `1 preview`, `20s`. +- The vertical rule and nested cards create a heavy left rail when replayed + under a short assistant answer. +- Long-running quiet tools need a heartbeat line such as `Last update 18s ago` + once a running tool has no new output for a threshold. This should be derived + from events and timestamps, not invented agent prose. + +### ToolCallCard + +`tool-call-card.tsx` is correctly collapsed by default and auto-expands only for +error and blocked states. That is the right default. The missing piece is the +collapsed information model. + +Current issues: + +- Tool identity is ad hoc. 
Only Claude-style built-ins receive good subtitles. + Phantom-native MCP tools, external MCP tools, and generic tools often collapse + to weak labels. +- The MCP server detection is string-based in the client and translator. It + does not model origin, server, capability kind, or display name. +- `phantom_create_page` and `phantom_preview_page` already return metadata, but + collapsed cards do not show title, URL, path, size, preview status, console + count, failed request count, or screenshot availability. +- `fullRef` renders as inert text. It should remain safe and redacted, but it + needs an explicit copy action and a clear label. +- Expanded output is still a wall of text. It should be structured into + Parameters, Output, Artifacts, Errors, Redactions, and Full output. + +Collapsed cards should show: + +- Origin: Phantom, Shell, Files, Web, Memory, External MCP. +- Subject: command, path, URL, title, query, memory type, or artifact filename. +- State: running, completed, blocked, errored, stopped. +- Timing: elapsed while running, duration when complete. +- Result facts: output truncated, full output saved, issue counts, file size, + match count, or generated artifact count. +- Actions when safe: open page, copy URL, copy path, copy output reference, + expand details. + +### AssistantMessage + +Current `AssistantMessage` renders all text blocks, but it still groups +thinking blocks first, then tool calls, then text blocks. Pi web UI renders +assistant chunks in source order, including text, thinking, and tool calls. + +Fix direction: + +- Do not reintroduce the hidden-text bug. Keep every text block visible. +- In a later slice, preserve interleaving by representing assistant content, + thinking blocks, and tool calls in one ordered render stream. +- Keep private reasoning hidden. The current `ThinkingBlock` duration support + is useful, but redacted thinking should remain clearly labeled as hidden. 
+ +### UserMessage + +User attachment chips are now present and durable. The remaining polish is +ergonomic: + +- Long filenames truncate without an explicit copy/open icon. +- User request cards are still bubble-like and use a stronger tint than the + rest of the chat surface. +- User messages do not get a copy affordance, while assistant messages do. +- File-only sends are blocked by the composer because send requires non-empty + text. That can wait, but it is a natural file workflow. + +### ChatInput + +The composer is functional and now blocks failed upload sends. Remaining polish: + +- Uploading is represented by attachment tile spinners, but the composer itself + has no upload-progress or upload-error state. +- Remove buttons on pending attachment tiles are hover-only. Touch users can + miss them. +- The shell header, sidebar, composer, and card borders all use the same border + tone, which makes the layout stack feel heavier than the product needs. +- The toolbar has only attachments. Memory, transcript search, and artifact + controls should eventually be explicit controls, not hidden commands. + +### Markdown And Code + +`markdown.tsx` uses `react-markdown`, `remark-gfm`, and `rehype-sanitize`, but +only customizes code and links. `package.json` does not include +`@tailwindcss/typography`, so the `prose` classes may not provide the intended +table/list/blockquote styling. + +Needed polish: + +- Explicit components for tables, lists, blockquotes, horizontal rules, links, + generated `/ui/...` URLs, and code blocks. +- Tables in a horizontal overflow container with tabular numerics. +- Code copy visible on focus and touch, not only hover. +- Optional wrap toggle for long code and logs. +- Link cards for generated Phantom pages when the URL is known to be safe. + +### Chat Activity And Store + +`chat-activity.ts` and `chat-store.ts` have a strong foundation for live and +replayed activity. 
They already support attachments, thinking duration, +compaction, rate limits, subagents, MCP readiness, stream replay, caught-up +state, and durable run timelines. + +Missing state shape: + +- Tool origin and capability kind. +- Artifact summaries attached to tool summaries. +- Safe output references with copy/open affordances. +- Per-tool updated-at timestamps for quiet long-running tools. +- Provider/model reasoning capability labels. Show `Reasoning hidden`, + `Reasoned for 8s`, or `Provider did not expose reasoning`; do not show raw + private thinking. + +### Durable Run Timeline + +`src/chat/run-timeline.ts` persists the right high-level run structure, but the +tool summary schema is too generic for the next UI slice. It stores name, state, +input/output summaries, timing, truncation, MCP flags, and block reason, but no +artifact data. + +The key gap is `summarizeToolOutput`: for successful non-empty output it returns +`Tool produced output.`. That is safe, but it discards exactly the metadata the +chat UI needs for Phantom-native page tools. + +Add a structured, allowlisted artifact summary contract for known Phantom tools: + +- `phantom_create_page`: title, path, URL, size, created/updated state. +- `phantom_preview_page`: title, path, HTTP status, screenshot reference or + screenshot availability, console issue count, failed request count, viewport. +- File-producing tools when safe: path, filename, extension, byte count, action. + +Do not parse arbitrary external MCP JSON into artifacts. Start with +Phantom-native tools and strictly validate shape. + +### UI Tools And Preview + +`src/ui/tools.ts` returns exactly the metadata the chat needs for created pages: +`created`, `path`, `url`, `size`, and a note that the URL is not a login link. + +`src/ui/preview.ts` returns a PNG image block plus JSON metadata with HTTP +status, title, console messages, and failed network requests. 
Today that is +useful to the agent, but the chat UI should also use the safe summary. + +Fix direction: + +- Keep full image bytes out of durable chat summaries unless a bounded storage + reference exists. +- Store counts and status in the tool summary. +- Render the screenshot thumbnail only through a safe reference or live frame, + not by dumping base64 into the transcript. + +### Pi Web UI Patterns To Borrow + +Pi has five patterns worth borrowing, not porting wholesale: + +- `ChatPanel` automatically opens an artifact panel when new artifacts are + created, reconstructs artifacts from existing messages, and exposes a compact + floating artifact pill with a count. +- `ArtifactsToolRenderer` uses inline artifact pills in tool headers and keeps + code/log details collapsed by default. +- `ArtifactsPanel` uses tabs for multiple artifacts and per-artifact header + buttons like preview/code toggle, reload, copy, and download. +- `AttachmentTile` opens a file overlay and keeps touch removal discoverable. +- `Messages.ts` renders assistant chunks in source order. + +Phantom should borrow the mental model: inline artifact pills first, a session +artifact rail later, and source-order rendering when the data model supports it. + +## Concrete Next Slices, In Order + +### Slice 1: Live Run Presence + +Goal: the user never wonders if the agent is alive. + +Build: + +- Attach live run activity to the current turn. +- Keep elapsed time live every second. +- Show the current phase within one second of send. +- Improve labels from raw tool names to capability labels. +- Add quiet-tool heartbeat based on last event time. +- Keep replayed timelines visually identical to live timelines after reload. +- Add mobile-safe wrapping and fixed dimensions so labels, badges, and timers do + not resize the transcript. + +Acceptance: + +- A `sleep 12 && pwd` style task shows active state within one second. +- The timer increments while the tool is running. 
+- The stopped state appears immediately when the user presses stop. +- Reload after completion shows the same run timeline collapsed by default. +- No hidden reasoning or fake progress prose is introduced. + +### Slice 2: Tool Identity And Collapsed Card Summaries + +Goal: a collapsed tool card is useful without expansion. + +Build: + +- Add normalized identity: + `{ origin, serverName, rawName, displayName, capabilityKind }`. +- Add capability mappings for Shell, Files, Search, Web, Phantom pages, + Phantom preview, Scheduler, Memory, Secrets, Browser, and External MCP. +- Add safe primary subject: + command, path, URL, title, query, memory type, or artifact filename. +- Add safe result facts: + duration, output truncated, full output saved, page size, preview status, + console issue count, failed request count, file size, match count. +- Add open/copy buttons only when data is already known and safe. +- Preserve collapsed-by-default behavior for completed cards. Auto-open only + error and blocked cards. + +Acceptance: + +- Bash collapsed card shows command summary, elapsed or duration, and output + saved/truncated if relevant. +- `phantom_create_page` collapsed card shows page title/path and URL action. +- `phantom_preview_page` collapsed card shows HTTP status, title, issue counts, + and screenshot availability. +- External MCP cards show server plus tool, not a generic unknown label. +- Expanded card sections are structured and redacted. + +### Slice 3: First-Class Page And File Artifacts + +Goal: generated files/pages are visible chat objects, not JSON archaeology. + +Build: + +- Add `ArtifactSummary` to durable tool summaries and client state. +- Render inline artifact cards under the relevant tool or assistant message. +- Page artifact card fields: + title, path, URL, size, preview status, console issue count, failed request + count, last preview time, actions. 
+- File artifact card fields: + filename, extension/type, path, size when known, action, open/copy affordance. +- Use a `Details` expansion for raw parameters and redacted output. +- Keep the future session artifact rail out of this PR unless the inline card + contract proves too cramped. + +Acceptance: + +- Created `/ui/...` pages can be opened from chat without reading JSON. +- The URL can be copied from chat. +- Preview failures show a clear issue count and expandable details. +- Reload preserves artifact cards. +- Login links from `phantom_generate_login` never render as page artifacts. + +### Slice 4: Markdown And Small Interaction Polish + +Goal: generated reports, code, tables, and links look like product output. + +Build: + +- Explicit markdown table/list/blockquote/link/code components. +- Code copy visible on touch and keyboard focus. +- Wrap toggle for code and logs. +- Generated Phantom page URL cards when URL is safe. +- Softer shell boundaries, keep object borders on tool/code/artifact cards. +- Attachment tile error styling and touch-visible remove buttons. + +Acceptance: + +- Tables are readable on desktop and horizontally scroll on mobile. +- Long links do not overflow. +- Copy controls are reachable on touch and keyboard. +- No text overlaps on 390 px mobile width. + +### What Should Wait + +- Full artifact side panel or gallery. +- Full transcript search UI. +- Full memory explorer inside chat. +- Editing artifacts from chat. +- Arbitrary local filesystem preview. +- Provider reasoning summaries until Murph exposes an explicit safe summary + contract. +- Rich syntax highlighting unless it is a small dependency choice with no bundle + or security cost. + +## Icon Recommendations + +Use `lucide-react` consistently. Avoid emojis as state or object primitives. + +- User-visible memory: `BookOpenText`. +- Agent-visible memory context: `Brain` for the cognitive surface, + `Database` for storage/health. 
+- Transcript search: `Search` for the action, `ScrollText` for transcript + history. +- Artifacts: `PanelsTopLeft` for the artifact surface, `SquareStack` for an + artifact collection, `FileCode2` for code artifacts. +- Files: `File`, `FileText`, `FileCode2`, `Image`, `FileArchive`, `Table`. +- Pages and URLs: `PanelTop` or `Globe2` for pages, `ExternalLink` for open, + `Copy` for copy URL/path. +- Shell and tools: `Terminal` for Bash, `Search` for Grep/WebSearch, + `Wrench` for generic tools. +- Tool states: `Loader2` running, `CheckCircle2` complete, `CircleX` error, + `ShieldAlert` blocked, `AlertCircle` warning, `Square` or `CircleStop` + stopped. +- Progress: `Activity` for run activity, `Clock3` for elapsed time, `Radio` + for live connection, `RotateCw` or `RefreshCw` for reconnect/retry, + `Hourglass` for waiting. +- Preview: `Eye` for preview, `RefreshCw` for re-preview. + +## Visual Acceptance Criteria + +Desktop criteria: + +- At 1440 x 1000, the active run strip is visually attached to the current turn. +- Completed run timelines do not dominate short assistant answers. +- Tool cards stay collapsed by default and fit inside the transcript width. +- Page artifact cards show title/path/action in one scan. +- Expanded details do not show an unstructured JSON wall before the useful + sections. + +Mobile criteria: + +- At 390 x 844, no label, badge, filename, URL, code block, or action row + overlaps adjacent content. +- Tool card headers wrap gracefully and keep state icons visible. +- Attachment remove buttons and code copy buttons are reachable without hover. +- Tables and code scroll horizontally inside their own containers. +- The composer does not jump when attachment status changes. + +State criteria: + +- Empty chat: calm empty state, no marketing layer. +- Loading session: visible loading skeleton or simple text in the transcript + area, not only an inert blank screen. +- Long-running: active phase visible within one second. 
+- Reconnect/replay: `Replaying recent activity` then `Reconnected`, without + losing the run timeline. +- Error: assistant status and run row clearly show error, with no successful + blank assistant row. +- Blocked: tool card auto-expands and states the safe block reason. +- Compaction: visible `Compacted context and kept working` state with token + fact when available. + +## Playwright Verification Plan + +Use two layers: deterministic visual fixtures for UI states, plus one live smoke +against Phantom-on-Murph. + +### Deterministic Screenshots + +Create Playwright scenarios that mock the session API and SSE frames: + +1. **Long-running live run** + - User message arrives. + - `message.tool_call_start`, `message.tool_call_input_end`, and repeated + `message.tool_call_running` frames arrive. + - Capture at 1 second, 6 seconds, stopped, completed, and after reload. +2. **Tool card matrix** + - Bash running/result/error. + - Read/Write/Edit file tools. + - `phantom_create_page` success. + - `phantom_preview_page` success with console and network issues. + - Blocked tool. + - External MCP tool. + - Capture collapsed default and expanded states. +3. **Artifact cards** + - Page card with URL, path, size, preview status, issue counts, screenshot + availability. + - File card with type, size, path, copy/open actions. + - Reloaded session with durable artifact summaries. +4. **Markdown report** + - Table, long URL, ordered and unordered lists, blockquote, inline code, + fenced code, and generated `/ui/...` URL. + - Capture desktop and mobile. +5. **Attachment states** + - Image, PDF, text file, uploading, error, durable sent chip. + - Capture composer and sent message. +6. **Memory and search controls** + - Only once built. Verify icons, labels, empty state, and no hidden reasoning. + +Required viewports: + +- Desktop: 1440 x 1000. +- Narrow laptop: 1024 x 768. +- Mobile: 390 x 844. + +Assertions: + +- No horizontal page overflow. +- Buttons have accessible names. 
+- Tool cards are collapsed by default except blocked/error. +- Active elapsed timer changes while running. +- Copy/open buttons are visible on keyboard focus. +- Raw JSON is not visible in collapsed artifact cards. + +### Live Smoke + +After deterministic screenshots pass, run one real local smoke: + +1. Start Phantom locally with Murph and OpenAI. +2. Authenticate through the chat UI. +3. Send a prompt that performs a long Bash or page preview task. +4. Capture before tool start, mid-tool, after completion, and after reload. +5. Create and preview a `/ui/...` page. +6. Verify the page artifact card opens the page and copies the URL. +7. Verify no push-notification 503 noise changes chat behavior. + +The deterministic layer catches visual regressions cheaply. The live smoke +proves real Murph frames still feed the same UI contract. + +## Risks And Edge Cases + +- **Sensitive output:** Tool summaries must remain allowlisted and redacted. + Do not add open actions for arbitrary local paths. +- **Tool JSON variance:** Parse structured artifact data only for known + Phantom-native tools. Arbitrary MCP output should stay generic. +- **Large screenshots:** Store bounded references or counts, not unbounded base64 + in durable timelines. +- **Multiple tabs:** Expansion state and active run attachment must survive + resume/replay without duplicating tool cards. +- **Out-of-order frames:** Tool running/result frames can arrive before start in + recovery paths. The client already has placeholder tools; keep that posture. +- **Mobile density:** Artifact actions can crowd small screens. Use icon buttons + with tooltips and accessible names. +- **Provider differences:** Some providers expose reasoning duration, some hide + it, and some expose no reasoning signal. Label capability truthfully. +- **Compaction:** After compaction, artifact summaries must remain durable enough + for both the user and the agent-visible continuity context. 
+- **Auth confusion:** `phantom_generate_login` links must never be treated as page + artifacts. Created `/ui/...` URLs and login URLs need distinct labels. +- **Memory sensitivity:** User-visible memory and agent-visible memory are + different surfaces. Show what is stored and editable without exposing hidden + reasoning or private tool output. diff --git a/research/chat-experience/phase-10i-memory-architecture-research.md b/research/chat-experience/phase-10i-memory-architecture-research.md new file mode 100644 index 00000000..6a7420ea --- /dev/null +++ b/research/chat-experience/phase-10i-memory-architecture-research.md @@ -0,0 +1,251 @@ +# Phase 10I Memory Architecture Research + +Date: 2026-05-01 + +## 1. Executive recommendation + +Phantom should treat memory as three product surfaces, not one. + +1. User-visible memory: editable or inspectable knowledge the operator can understand and correct. This is already split between `.claude` markdown files in the Memory files dashboard, Qdrant memories in Memory explorer, and read-only `phantom-config/memory/agent-notes.md`. +2. Agent-visible operational memory: compact, safe continuity facts and targeted retrieval tools that help the agent resume work after Murph compaction. This should live in Phantom, not Murph, because it is based on Phantom chat sessions, tool cards, pages, artifacts, and auth semantics. +3. Hidden runtime records: raw chat stream events, tool payloads, attachments, costs, and transcript storage. These should remain durable and searchable by bounded tools, but not rendered as "memory" and not dumped into prompts. + +The next builder slice should not create a new database. It should extend the existing Phantom reflective in-process MCP server with a chat-history retrieval tool backed by existing SQLite tables, then inject the current chat session id into the existing Murph `transformContext` continuity block. 
The agent gets a small prompt hint plus a tool it can call only when it needs older compacted details. The user gets no new confusing memory object yet. + +## 2. Current state inventory + +### Prompt assembly and prompt memory + +- `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-assembler.ts`, `assemblePrompt(...)` + builds the system prompt in this order: identity, environment, security, role prompt, onboarding, evolved config, agent memory instructions, general instructions, working memory, Qdrant memory context, and optional chat runtime context. + +- `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-blocks/agent-memory-instructions.ts`, `buildAgentMemoryInstructions()` + teaches the main agent to append durable learnings to `phantom-config/memory/agent-notes.md` with Write or Edit. The file contents are deliberately not injected into the prompt. Tests in `/Users/truffle/work/phantom-murph-hardening/src/agent/__tests__/prompt-assembler.test.ts` assert the path is present, append-only rules are present, and file contents are not injected. + +- `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-blocks/working-memory.ts`, `buildWorkingMemory(dataDir)` + reads `data/working-memory.md` and injects it under `# Working Memory`. It truncates after 75 lines and asks the agent to compact the file. `/Users/truffle/work/phantom-murph-hardening/src/index.ts` seeds this file on startup if it is missing. + +- `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-blocks/instructions.ts`, `buildInstructions()` + also tells the agent to read and update `data/working-memory.md` at the start of every new conversation. In practice, the prompt assembler already injects the file contents, so this is both an instruction and a loaded prompt surface. 
+ +- `/Users/truffle/work/phantom-murph-hardening/src/memory/context-builder.ts`, `MemoryContextBuilder.build(query)` + queries Qdrant-backed episodes, facts, and procedures and formats the results under `# Your Memory`. Facts get first priority, episodes are filtered through `shouldIncludeEpisodeInContext(...)`, and procedures are included if budget allows. Tests in `/Users/truffle/work/phantom-murph-hardening/src/memory/__tests__/context-builder.test.ts` assert readiness checks, formatting, stale episode filtering, budget behavior, and error tolerance. + +### Murph chat continuity + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/continuity-context.ts`, `buildChatContinuityContext(...)` + scans the durable chat stream log, currently up to 5000 recent events, and extracts page artifacts from `phantom_create_page` and `phantom_preview_page`, plus recent `session.compact_boundary` checkpoints. It excludes `/ui/login` auth links and limits output to 8 artifacts and 3 compaction checkpoints. Tests in `/Users/truffle/work/phantom-murph-hardening/src/chat/__tests__/continuity-context.test.ts` assert page extraction, login exclusion, compaction reporting, and tail-limited scan behavior. + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/writer.ts`, `ChatSessionWriter.run(...)` + persists the user message, appends live stream events to `ChatEventLog`, builds `sessionContext` through `buildChatContinuityContext(...)`, and passes it into `AgentRuntime.runForChat(...)`. Tests in `/Users/truffle/work/phantom-murph-hardening/src/chat/__tests__/writer.test.ts` assert page continuity reaches the runtime. + +- `/Users/truffle/work/phantom-murph-hardening/src/agent/chat-query.ts`, `executeChatQuery(...)` + sends `sessionContext` differently by runtime. Anthropic fallback appends it inside the system prompt as `# Current Chat Context`. Murph uses `createMurphContextTransform(...)` so the context becomes a Pi-compatible user message outside the system prompt. 
+ +- `/Users/truffle/work/phantom-murph-hardening/src/agent/murph-context.ts`, `createMurphContextTransform(...)` + injects a single Phantom context message before the latest user message and removes stale Phantom context messages so they do not accumulate. Tests in `/Users/truffle/work/phantom-murph-hardening/src/agent/__tests__/murph-context.test.ts` and `/Users/truffle/work/phantom-murph-hardening/src/agent/__tests__/agent-sdk-boundary-callers.test.ts` assert placement, replacement, and absence from the system prompt on Murph. + +### Agent tools for memory and history + +- `/Users/truffle/work/phantom-murph-hardening/src/agent/in-process-reflective-tools.ts`, `createReflectiveToolServer(memory, db)` + exposes in-process MCP tools to the agent: `phantom_memory_search` and `phantom_list_sessions`. These are available inside Agent SDK queries through `/Users/truffle/work/phantom-murph-hardening/src/index.ts`, where `runtime.setMcpServerFactories(...)` registers `"phantom-reflective"` per query. `phantom_memory_search` returns Qdrant episodes and facts, optionally time-bounded. `phantom_list_sessions` returns rows from the runtime `sessions` table. + +- `/Users/truffle/work/phantom-murph-hardening/src/mcp/tools-universal.ts`, `registerUniversalTools(...)` + exposes external MCP equivalents: `phantom_history`, `phantom_list_sessions`, `phantom_memory_query`, and `phantom_memory_search`. Tests in `/Users/truffle/work/phantom-murph-hardening/src/mcp/__tests__/tools-universal.test.ts` assert alias registration, session filtering by channel and days, memory search recency filtering, and clean unavailable-memory errors. + +Important gap: neither the in-process reflective tools nor the external MCP tools expose the current web chat transcript by `chat_sessions.id`. They list runtime `sessions` rows and search Qdrant memories, but they do not let the agent retrieve earlier chat messages or safe tool timeline details after compaction.
+ +### User-visible memory UI and APIs + +- `/Users/truffle/work/phantom-murph-hardening/src/ui/api/memory-files.ts`, `handleMemoryFilesApi(...)` + provides cookie-gated CRUD for markdown files under the user-scope `.claude` root and read-only access to allow-listed `phantom-config/memory` files. Writes are audited through `recordMemoryFileEdit(...)`. + +- `/Users/truffle/work/phantom-murph-hardening/src/memory-files/paths.ts` + validates memory file paths. It allows markdown only, blocks hidden files, traversal, `skills/`, `plugins/`, `agents/`, `settings.json`, and `settings.local.json`. It explicitly allow-lists `phantom-config/memory/agent-notes.md` as a read-only virtual path. + +- `/Users/truffle/work/phantom-murph-hardening/src/memory-files/storage.ts` + implements list, read, atomic write, and delete. It limits file content to 256 KB and surfaces `phantom-config/memory/agent-notes.md` as read-only with description "Agent notes (the agent's own learnings, append-only)". + +- `/Users/truffle/work/phantom-murph-hardening/public/dashboard/memory-files.js` + renders the Memory files tab. It describes `.claude` markdown as persistent memory, lets the operator create, edit, save, and delete normal memory files, and renders read-only files as agent-maintained memory. + +- `/Users/truffle/work/phantom-murph-hardening/src/ui/api/memory.ts`, `handleMemoryApi(...)` + provides cookie-gated Memory explorer APIs over Qdrant episodes, facts, and procedures. It supports health, list, search, detail, and delete. `/Users/truffle/work/phantom-murph-hardening/public/dashboard/memory.js` renders this as "Memory explorer" with copy-as-JSON and delete controls. + +Tests in `/Users/truffle/work/phantom-murph-hardening/src/ui/api/__tests__/memory-files.test.ts`, `/Users/truffle/work/phantom-murph-hardening/src/memory-files/__tests__/storage.test.ts`, and `/Users/truffle/work/phantom-murph-hardening/src/ui/api/__tests__/memory.test.ts` define the intended user-visible behavior. 
+ +### Hidden durable transcript and event storage + +- `/Users/truffle/work/phantom-murph-hardening/src/db/schema.ts` + defines `chat_sessions`, `chat_messages`, `chat_stream_events`, and `chat_run_timelines`. These are durable across process restarts and are the right storage base for operational continuity. + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/message-store.ts`, `ChatMessageStore` + stores committed user and assistant transcript rows. User messages store prompt text and attachment metadata, not raw attachment bytes. + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/event-log.ts`, `ChatEventLog` + stores the event stream, including tool calls, tool inputs, outputs, compaction status, errors, and terminal events. This is operational evidence, not user memory. + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/run-timeline.ts` + stores durable run summaries for UI replay. This is a UI timeline surface, not a long-term memory surface. + +## 3. Proposed memory taxonomy + +### For-user memory + +For-user memory means editable, inspectable, human-trustable knowledge. It should render in the dashboard and be safe for the operator to correct. + +Primary user memory surfaces: + +- Memory files: `.claude/**/*.md` through `/ui/api/memory-files` and `public/dashboard/memory-files.js`. +- Agent notes: `phantom-config/memory/agent-notes.md`, read-only in the dashboard because the agent appends to it directly. +- Memory explorer: Qdrant episodes, facts, and procedures through `/ui/api/memory` and `public/dashboard/memory.js`, with delete as the correction mechanism. +- Evolved config files such as `user-profile.md`, `domain-knowledge.md`, and strategy files, managed by the reflection subprocess and visible elsewhere in the dashboard. + +Product rule: if a user can reasonably ask "why did Phantom remember this about me?", it belongs in a user-visible memory surface or should be traceable to one. 
+ +### For-agent memory + +For-agent memory means operational continuity that helps Phantom do the next step without asking the user to repeat context. It is not necessarily a user-facing belief. It should be compact, retrievable, scoped, and auditable. + +It should include: + +- Current chat session id. +- Recent committed user and assistant messages. +- Safe run timeline summaries. +- Page and artifact references. +- Compaction checkpoints. +- Tool call names, statuses, paths, URLs, sizes, and short previews. +- Retrieval handles for older transcript slices. + +It should not include: + +- Raw tool full outputs by default. +- Raw screenshots, base64, binary blobs, or full HTML dumps. +- Credential values, auth tokens, magic-link secrets, or `.env` contents. +- Provider private thinking or chain-of-thought. +- Hidden internal tool protocol details unless explicitly needed for recovery. + +Where it should live: in Phantom's SQLite chat tables and in Phantom-owned in-process MCP tools. Murph should remain generic and only supply the transform and compaction seam. + +### Hidden runtime records + +Hidden runtime records are durable evidence and replay data. They can feed tools and summaries, but they should not be labeled as memory in the UI. + +Examples: + +- `chat_stream_events.payload_json` +- full tool inputs and outputs +- attachment storage paths and hashes +- `chat_run_timelines.summary_json` +- SDK session ids +- token counts and cost rows + +Product rule: hidden runtime records can be used by the agent for continuity through safe extractors, but users should see curated views, not raw internal logs. + +## 4. Proposed agent prompt and tool contract + +The prompt should stay small. The agent should receive: + +1. Normal system prompt memory from `assemblePrompt(...)`. +2. Qdrant recall from `MemoryContextBuilder.build(...)`, bounded by memory config. +3. 
Current chat continuity from `buildChatContinuityContext(...)`, injected by `createMurphContextTransform(...)` on Murph. +4. A new line in the chat continuity block containing the current `chat_sessions.id`. +5. A new in-process MCP tool for bounded transcript recovery. + +Recommended new tool: + +`phantom_chat_history` + +Input: + +- `session_id`: current chat session id, required. +- `query`: optional search text. +- `limit`: default 10, max 50. +- `before_seq`: optional transcript cursor. +- `include_tool_events`: optional, default false. + +Default output: + +- session metadata from `chat_sessions` +- committed `chat_messages` rows as role, seq, created_at, text excerpt, attachment metadata summary, token and cost metadata +- page artifacts and compaction checkpoints from `buildChatContinuityContext(...)` or a shared parser extracted from it + +When `include_tool_events` is true: + +- tool call id, tool name, status, stream seq, short input summary, short output preview +- never full output by default +- never raw base64 +- never login magic links + +Prompt language should say: "If a needed detail is missing after compaction, call `phantom_chat_history` with the current chat session id. Do not ask the user to repeat context until that lookup fails." + +This is better than injecting more transcript into every prompt. It preserves post-compaction continuity while keeping prompt size bounded and putting sensitive records behind explicit retrieval. + +## 5. Proposed user-visible UI and API contract + +No new user-visible "for-agent memory" UI should ship in the smallest slice. The existing user-facing contract should stay: + +- Memory files are editable user-authored markdown, except read-only surfaced agent notes. +- Memory explorer shows Qdrant memories and lets users inspect, copy, search, and delete them. +- Chat timeline shows work done in a session, including tool cards and artifacts. + +Later, the right user-visible addition is not a new memory tab. 
It is a session-level "Context used" or "Recovered context" affordance in chat that shows when Phantom used compacted context or `phantom_chat_history`, with safe high-level rows such as: + +- "Recovered earlier page artifact: /ui/profile.html" +- "Looked up 6 earlier messages in this chat" +- "Used Qdrant fact: user prefers PRs over direct pushes" + +The UI should not show raw hidden tool output or provider private reasoning. If a user wants to inspect raw transcript, that should be an advanced export or admin-debug surface with redaction, not the default memory UI. + +## 6. Risks and privacy boundaries + +Key risks: + +- Prompt bloat: adding full transcript snippets to `buildChatContinuityContext(...)` would grow every turn. Prefer a retrieval tool. +- Sensitive leakage: `chat_stream_events` can contain tool input/output, page HTML, screenshots, file paths, auth links, and error details. Tool output must redact or omit unsafe fields. +- User confusion: calling operational transcript recovery "memory" will make users think Phantom believes those details forever. Keep operational continuity separate from user memory. +- Self-reinforcement: injecting `agent-notes.md` on every run would make the agent treat its own append log as canonical. Existing tests correctly prevent this. +- Cloud durability: SQLite chat tables are durable across process restarts on the VM today. Future cloud must preserve the same database or migrate these records to managed durable storage before relying on the tool. +- Access control: in-process tools are agent-visible. External MCP tools need existing bearer scopes and should not gain raw chat-history access by default. + +Hard boundaries: + +- Do not expose raw chain-of-thought or provider private thinking. +- Do not store raw base64, credentials, or unredacted full tool outputs in user memory. +- Do not show `phantom_generate_login` magic links as artifacts. 
+- Do not let the user-editable memory file API write into `phantom-config/memory/agent-notes.md`. +- Do not move Phantom chat history retrieval into Murph. Murph should stay a generic runtime. + +## 7. Smallest next builder slice + +Build a Phantom-only "bounded current chat history retrieval" slice. + +Likely files touched: + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/continuity-context.ts` + include the current chat session id in the rendered continuity context and extract shared safe artifact parsing helpers if useful. + +- `/Users/truffle/work/phantom-murph-hardening/src/agent/in-process-reflective-tools.ts` + add `phantom_chat_history` backed by `chat_sessions`, `chat_messages`, and optionally `chat_stream_events`. Return bounded excerpts and safe tool summaries. Redact `/ui/login` URLs, token-like values, and base64-looking blocks. + +- `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-assembler.ts` or `/Users/truffle/work/phantom-murph-hardening/src/agent/prompt-blocks/instructions.ts` + add one short instruction telling the agent to use `phantom_chat_history` before asking the user to repeat compacted details. Keep it under about 80 words. + +- `/Users/truffle/work/phantom-murph-hardening/src/chat/writer.ts` + likely no behavior change beyond receiving the updated continuity context. Touch only if the tool needs additional session metadata. + +Tests required: + +- Add tests in `/Users/truffle/work/phantom-murph-hardening/src/chat/__tests__/continuity-context.test.ts` proving the context includes the current chat session id and still excludes auth links. +- Add tests in `/Users/truffle/work/phantom-murph-hardening/src/agent/__tests__/murph-context.test.ts` or `/Users/truffle/work/phantom-murph-hardening/src/agent/__tests__/agent-sdk-boundary-callers.test.ts` proving the session id reaches Murph through `transformContext`, not the system prompt. 
+- Add tests for `phantom_chat_history` in a new or existing in-process reflective tools test, with cases for message retrieval, query filtering, limit enforcement, safe tool summaries, base64 elision, and magic-link redaction. +- Keep existing memory-files tests unchanged, especially read-only `agent-notes.md`. +- Run focused tests for chat continuity, Murph context, agent SDK boundary callers, in-process reflective tools, memory-files, and MCP universal tools, then the normal `bun test`, `bun run typecheck`, and `bun run lint` gate. + +Acceptance criteria: + +- After Murph compaction, the prompt contains only compact Phantom context plus a current chat session id. +- The agent can call `phantom_chat_history` to recover bounded prior details from the same chat. +- The tool never returns raw base64, credentials, login magic links, or full unbounded tool outputs. +- No user-visible memory UI changes are required for this slice. +- Murph remains generic. All Phantom chat-history semantics stay in Phantom. diff --git a/research/chat-experience/phase-10i-transcript-recovery-final-re-review.md b/research/chat-experience/phase-10i-transcript-recovery-final-re-review.md new file mode 100644 index 00000000..5573bea3 --- /dev/null +++ b/research/chat-experience/phase-10i-transcript-recovery-final-re-review.md @@ -0,0 +1,36 @@ +# Phase 10I Transcript Recovery Final Re-review + +Date: 2026-05-01 + +## Findings + +No P0, P1, or P2 findings. + +The previous P2, attachment metadata bypassing transcript redaction, is resolved. +I did not find a new P0, P1, or P2 issue introduced by the narrow fix. + +## Evidence + +File inspection: + +- `src/chat/transcript-search.ts:225` through `src/chat/transcript-search.ts:236` now redacts both attachment metadata fields returned to callers. `filename` is passed through `redactSensitiveText(filename)`, and `mime_type` is passed through `redactSensitiveText(mimeType)`. 
+- `src/chat/transcript-search.ts:208` through `src/chat/transcript-search.ts:211` builds the attachment snippet from the same sanitized attachment summary, so snippet text and returned metadata share the same redacted values. +- `src/chat/__tests__/transcript-search.test.ts:144` through `src/chat/__tests__/transcript-search.test.ts:170` adds a focused regression test for the prior leak. It verifies the redacted snippet, the redacted `attachments` array, and absence of the raw AWS-shaped filename, raw metadata secret, and raw attachment payload marker. +- `src/chat/redaction.ts` remains the shared helper used by transcript search and durable timeline redaction. +- Current related diff was inspected, including transcript search, redaction, reflective transcript tools, MCP factory context plumbing, continuity context, and run timeline redaction reuse. + +Additional adversarial check: + +- I ran an in-memory transcript search with an attachment filename of `AKIAABCDEFGHIJKLMNOP.pdf`, a MIME-like metadata value of `application/x-api-key=raw-secret`, and a raw attachment payload marker of `RAW_BASE64_PAYLOAD`. +- The returned JSON contained `"[REDACTED_AWS_KEY].pdf"` and `"application/x-api-key=[REDACTED]"` in both the snippet path and the `attachments` array. +- The returned JSON did not contain `AKIAABCDEFGHIJKLMNOP`, `raw-secret`, or `RAW_BASE64_PAYLOAD`. + +## Verification + +- `bun test src/chat/__tests__/transcript-search.test.ts src/chat/__tests__/continuity-context.test.ts src/agent/__tests__/reflective-transcript-tools.test.ts src/chat/__tests__/run-timeline.test.ts src/agent/__tests__/agent-sdk-boundary-callers.test.ts src/agent/__tests__/murph-context.test.ts`: pass, 42 tests, 0 fail. +- `bun run lint`: pass, Biome checked 389 files with no fixes applied. +- `bun run typecheck`: pass, `tsc --noEmit`. + +## Verdict + +The final narrow re-review passes. 
The attachment metadata redaction gap is closed, the regression coverage directly exercises the former leak, and the required verification commands are green. diff --git a/research/chat-experience/phase-10i-transcript-recovery-implementation.md b/research/chat-experience/phase-10i-transcript-recovery-implementation.md new file mode 100644 index 00000000..348ea8e3 --- /dev/null +++ b/research/chat-experience/phase-10i-transcript-recovery-implementation.md @@ -0,0 +1,90 @@ +# Phase 10I Transcript Recovery Implementation + +Date: 2026-05-01 + +## Goal + +Make Phantom on Murph able to recover compacted current-chat facts without +turning durable transcript history into long-term user memory or leaking raw +structured payloads back into provider context. + +## Shipped Behavior + +1. Phantom now injects a compact current-chat context block on every web chat + run. The block includes the current Phantom chat session id and instructs + the agent to call `phantom_chat_transcript_search` when an older detail is + missing after Murph compaction. +2. The in-process `phantom-reflective` MCP server now exposes + `phantom_chat_transcript_search` only inside a bound Phantom web chat run. + The tool rejects unbound calls and rejects any `session_id` other than the + current chat session id. +3. The runtime MCP factory seam now receives lightweight run context: + `sessionKey`, `channelId`, `conversationId`, and `chatSessionId` for web + chat runs. Existing no-argument factories remain compatible. +4. Transcript search reads committed `chat_messages` rows from SQLite with + query, role, sequence window, and limit controls. Results include + `session_id`, `seq`, `role`, `created_at`, `status`, citation, snippet, and + attachment metadata. +5. Transcript search respects both `chat_sessions.status != 'deleted'` and + the real `deleted_at IS NULL` soft-delete lifecycle. +6. Transcript snippets use a shared chat redaction helper also used by durable + run timelines. 
It redacts magic links, auth headers, secret-like query + parameters, AWS keys, private keys, cookie/header secrets, and large or + line-wrapped base64 blobs. +7. Transcript extraction is allow-list based. Known text and attachment blocks + are summarized. Unknown structured objects return + `[structured content omitted]` instead of raw JSON. +8. Attachment metadata returned by transcript search is redacted through the + same helper, so credential-shaped filenames and MIME labels cannot bypass + snippet redaction. + +## Files Changed + +- `src/agent/mcp-server-factory.ts` +- `src/agent/runtime.ts` +- `src/agent/chat-query.ts` +- `src/agent/in-process-reflective-tools.ts` +- `src/chat/continuity-context.ts` +- `src/chat/transcript-search.ts` +- `src/chat/redaction.ts` +- `src/chat/run-timeline.ts` +- `src/chat/__tests__/continuity-context.test.ts` +- `src/chat/__tests__/transcript-search.test.ts` +- `src/agent/__tests__/reflective-transcript-tools.test.ts` + +## Reviewer Findings Addressed + +1. P2 soft-delete and arbitrary session search: + fixed by adding `deleted_at IS NULL` to the transcript query guard and + threading current web chat session id into the reflective MCP tool factory. + The tool now rejects mismatched or unbound session ids before querying. +2. P2 redaction and structured extraction: + fixed by moving timeline redaction into `src/chat/redaction.ts`, extending it + for transcript needs, and replacing raw unknown-object stringification with + an omission sentinel. +3. P2 attachment metadata redaction: + fixed by redacting returned `filename` and `mime_type` fields in + transcript attachment summaries. Final re-review found no P0, P1, or P2 + findings. 
+ +## Verification + +- `bun test src/chat/__tests__/transcript-search.test.ts src/chat/__tests__/continuity-context.test.ts src/agent/__tests__/reflective-transcript-tools.test.ts src/chat/__tests__/run-timeline.test.ts src/agent/__tests__/agent-sdk-boundary-callers.test.ts src/agent/__tests__/murph-context.test.ts`: pass, 42 tests. +- `bun run lint`: pass. +- `bun run typecheck`: pass. +- `bun test`: pass, 2112 pass, 10 skip, 1 todo, 0 fail. +- `cd chat-ui && bun test`: pass, 27 tests. +- `cd chat-ui && bun run typecheck`: pass. +- `cd chat-ui && bun run build`: pass with existing font resolution and chunk + size warnings. + +## Remaining Follow-ups + +1. Query search still scans only the newest 2000 candidate transcript rows. + This is acceptable for the first recovery primitive but should become a + cursor or FTS strategy if real conversations exceed the bound often. +2. Page and file artifact facts should become durable first-class records so + links survive event-log sweeps without relying on transcript search. +3. The next UI slice should default tool cards collapsed after completion, + keep active tool cards readable while running, and expose a clearer run + timeline for long-running tasks. diff --git a/research/chat-experience/phase-10i-transcript-recovery-re-review.md b/research/chat-experience/phase-10i-transcript-recovery-re-review.md new file mode 100644 index 00000000..79f66652 --- /dev/null +++ b/research/chat-experience/phase-10i-transcript-recovery-re-review.md @@ -0,0 +1,84 @@ +# Phase 10I Transcript Recovery Re-review + +Date: 2026-05-01 + +## Findings + +### P2: Attachment metadata bypasses transcript redaction + +Files: + +- `src/chat/transcript-search.ts:85` +- `src/chat/transcript-search.ts:208` +- `src/chat/transcript-search.ts:228` + +The prior redaction finding is mostly fixed for snippet text, unknown structured +objects, private keys, AWS keys, cookies, magic links, auth query values, and +large or line-wrapped base64. 
However, transcript output still includes the +`attachments` metadata array from `attachmentSummary` without applying the +shared redaction helper to `filename` or `mime_type`. + +I verified this with an adversarial local check. A transcript attachment named +`AKIAABCDEFGHIJKLMNOP.pdf` produced a redacted snippet: +`[attachment: [REDACTED_AWS_KEY].pdf application/pdf] alpha`, but the same tool +result also returned: + +```json +"attachments": [ + { + "filename": "AKIAABCDEFGHIJKLMNOP.pdf", + "mime_type": "application/pdf", + "size_bytes": 1 + } +] +``` + +Impact: credential-shaped values can still leak through the transcript recovery +tool output even though the snippet path is redacted. This violates the +transcript trust boundary because the full JSON tool result is intended to be +pasted back into provider context. + +Suggested fix: apply `redactSensitiveText` to attachment metadata fields before +returning them, at minimum `filename` and `mime_type`, and add a regression test +that proves credential-shaped attachment metadata is redacted in both the +snippet and the `attachments` array. + +## Previous P2 Re-review + +1. Soft-delete lifecycle: fixed for the real `ChatSessionStore.softDelete` path. + `searchChatTranscript` now guards on `chat_sessions.deleted_at IS NULL`, and + the test uses `new ChatSessionStore(db).softDelete("chat-1")`. +2. Current-session restriction: fixed at the MCP tool layer. The reflective tool + receives `currentChatSessionId` from the chat MCP factory context and rejects + mismatched or unbound `session_id` values before querying. +3. MCP factory context: preserved for existing factories. The new factory + signature accepts optional context, and existing zero-argument factories + remain valid. The main runtime path still passes MCP servers through, and the + focused boundary test passes. +4. Structured extraction: fixed for unknown objects and attachment payloads. 
+ Unknown structured objects now return `[structured content omitted]`, and + attachments are summarized instead of returning payload bytes. The metadata + redaction gap above remains. +5. Shared redaction helper and durable timeline behavior: preserved by the + focused run timeline tests. +6. Tests: adequate for the original soft-delete path, bound-session rejection, + known snippet redaction, unknown structured omission, and durable timeline + redaction. Missing coverage for redaction of attachment metadata in the + returned `attachments` array. + +## Verification + +- `bun test src/chat/__tests__/transcript-search.test.ts src/chat/__tests__/continuity-context.test.ts src/agent/__tests__/reflective-transcript-tools.test.ts src/chat/__tests__/run-timeline.test.ts src/agent/__tests__/agent-sdk-boundary-callers.test.ts src/agent/__tests__/murph-context.test.ts`: pass, 41 tests, 0 fail. +- `bun run lint`: pass, Biome checked 389 files with no fixes applied. +- `bun run typecheck`: pass, `tsc --noEmit`. +- Additional adversarial check with `bun -e`: confirmed credential-shaped + attachment filenames are redacted in `snippet` but leaked raw in the returned + `attachments` array. + +## Verdict + +No P0 or P1 findings. + +One P2 finding remains. The previous soft-delete and bound-session P2 is fixed, +but the redaction P2 is not fully resolved because attachment metadata bypasses +redaction. diff --git a/research/chat-experience/phase-10i-transcript-recovery-review.md b/research/chat-experience/phase-10i-transcript-recovery-review.md new file mode 100644 index 00000000..77bfa0fb --- /dev/null +++ b/research/chat-experience/phase-10i-transcript-recovery-review.md @@ -0,0 +1,69 @@ +# Phase 10I Transcript Recovery Review + +Date: 2026-05-01 + +## Summary + +No P0 or P1 findings. + +I found two P2 findings that should be fixed before commit. 
The slice is pointed in the right direction: it keeps Murph generic, injects only a small current chat context, returns per-entry `session_id` and `citation` fields, and gives the agent a bounded SQLite-backed transcript tool. The current implementation is not yet production-safe because the tool does not fully enforce the chat session lifecycle and its redaction/extraction path can still expose credential-shaped or raw structured payloads. + +## Findings + +### P2: Transcript search misses the real soft-delete lifecycle and trusts any supplied active chat session id + +Files: + +- `src/chat/transcript-search.ts:108` +- `src/agent/in-process-reflective-tools.ts:180` + +The latest implementation adds an `EXISTS` guard for `chat_sessions.status != 'deleted'`, and the new unit test covers that status value. That does not match the actual app delete path. `ChatSessionStore.softDelete` sets `deleted_at` and leaves `status` unchanged, while normal session reads and lists filter on `deleted_at IS NULL`. A local check against the current code showed a soft-deleted session still returns transcript rows through `searchChatTranscript`. + +The MCP tool also accepts an arbitrary `session_id` string from the model and passes it directly into the search service. If the agent has or recovers another active chat id, it can search that transcript even though the tool contract says to use the current chat session id. + +Impact: a soft-deleted chat remains searchable until hard deletion. Unrelated active chat ids are also not rejected. This violates the research requirement to respect deleted sessions and weakens the "current session" boundary. + +Suggested fix: change the session guard to require `chat_sessions.deleted_at IS NULL`, and keep or add the status check if status-level deletion is still meaningful. 
Ideally pass the current chat session id into the in-process transcript tool factory for chat runs and reject mismatched session ids unless a future explicitly privileged all-session scope is added. + +### P2: Redaction and structured extraction are too narrow for a transcript recovery trust boundary + +Files: + +- `src/chat/transcript-search.ts:59` +- `src/chat/transcript-search.ts:192` +- `src/chat/transcript-search.ts:234` + +The redaction patterns cover the tested magic link, `sk-` token, bearer/basic auth, a few key names, and long contiguous base64. They miss common secrets already handled elsewhere in the repo, including AWS access keys, PEM private keys, and line-wrapped base64. A quick local check showed `redactSensitiveText("AWS AKIAABCDEFGHIJKLMNOP")` and a small PEM private key block are returned unchanged. + +The extractor also falls back to `stableStringify(value)` for any unrecognized object. That means a future or malformed persisted content object with fields such as `source.data`, attachment bytes, secret fields, or hidden protocol payloads can be dumped wholesale into tool output. + +Impact: the agent tool can leak API keys, private keys, raw base64, or attachment-like payloads despite the tool description promising safe redaction. This is especially risky because the output is intended to be pasted back into provider context. + +Suggested fix: move the stronger redaction logic from `src/chat/run-timeline.ts` into a shared helper and use it here. Add PEM, AWS key, cookie, wrapped base64, and auth-query coverage. Change extraction to an allow-list: known text fields and attachment summaries only, with unknown objects summarized as `[structured content omitted]` instead of raw JSON. + +## Lower Priority Follow-ups + +### P3: Query search is bounded but may miss older compacted details + +When `query` is present, the service scans only the latest 2000 candidate rows in descending seq order. 
That is a reasonable safety bound for the first slice, but a very long chat can still fail to recover the exact older detail that compaction removed. Consider documenting this limit in the tool response or adding a cursor strategy. + +### P3: Test gaps + +The focused tests now cover per-entry citations and `status = 'deleted'`, but not the real `ChatSessionStore.softDelete` path. They also do not cover mismatched session ids, private keys, AWS keys, line-wrapped base64, unknown structured content, or the full chat query boundary proving the transcript tool remains available on the Murph path while the session id reaches `transformContext`. + +## Review Questions + +1. Production-safe transcript recovery after Murph compaction: partially implemented, but not production-safe until the two P2 findings are fixed. +2. Current session id without provider bloat: yes for Murph. The context is small and goes through `transformContext`, not the system prompt. Non-Murph fallback still appends the same small block to the prompt. +3. Agent tool bounded, cited, redacted enough: bounded mostly yes. Cited yes for committed message rows through `chat:<session_id>#msg:<seq>`. Redacted no. +4. Leakage risk: magic links in the tested form are redacted. API keys, private keys, wrapped base64, and unknown structured payloads can leak. +5. Query, role, seq window, and limit behavior: generally correct. Role and seq filters are exclusive as described. Limits clamp to 1 through 50. Query is simple case-insensitive phrase or all-token matching over the newest bounded candidate rows. +6. Murph or Pi misuse: no. The slice keeps Phantom chat semantics in Phantom and uses Murph's existing context transform seam. +7. Blocking P0, P1, P2 issues: two P2 findings above. +8. Non-blocking follow-ups: P3 items above. + +## Verification + +- `bun test src/chat/__tests__/transcript-search.test.ts src/chat/__tests__/continuity-context.test.ts src/agent/__tests__/reflective-transcript-tools.test.ts`: pass, 13 tests. 
+- `bun run lint`: pass. +- `bun run typecheck`: pass. diff --git a/research/chat-experience/phase-10i-transcript-search-research.md b/research/chat-experience/phase-10i-transcript-search-research.md new file mode 100644 index 00000000..270a36fa --- /dev/null +++ b/research/chat-experience/phase-10i-transcript-search-research.md @@ -0,0 +1,553 @@ +# Phase 10I Transcript Search Research + +Date: 2026-05-01 + +Scope: research only. No application code changes. + +## Executive Recommendation + +Build a small SQLite-backed transcript lookup tool for the agent. Do not use +Qdrant for this first slice. + +The first production slice should expose one in-process MCP tool, +`phantom_transcript_search`, backed by a shared `src/chat/transcript-search.ts` +service. The default scope should be the current web chat session. The tool +should return compact, redacted snippets cited by `session_id`, `message_seq`, +and, when relevant, `stream_seq` or run timeline id. + +Keep the existing injected continuity context, but limit it to tiny high-value +facts that the agent should almost always have without asking, such as recent +page artifacts and compaction checkpoints. Do not inject arbitrary transcript +search results into every prompt. Exact recall should be agent-initiated through +the tool. + +SQLite is the right store for this slice because the exact web chat transcript +already lives in SQLite. Qdrant memory is useful for semantic memory, patterns, +and long-term summaries, but it is not a reliable source for exact transcript +details. Current Qdrant consolidation is heuristic, optional, and not wired as +the canonical store for web chat transcript turns. A vector system would add +more moving parts while still requiring a safe SQLite source of truth. + +The most important boundary: `chat_stream_events` has useful detail, including +tool inputs, tool result previews, and compaction boundaries, but it is swept +after 24 hours by `src/chat/sweep.ts`. 
Durable exact recall must rely first on +`chat_messages`, `chat_attachments`, and `chat_run_timelines`, plus a new +durable safe transcript index for selected event-derived facts that must +survive event-log sweep. + +The smallest valuable implementation is: + +1. Add a safe transcript search service over current durable chat tables. +2. Add `phantom_transcript_search` as an in-process MCP tool available to the + agent in every chat query. +3. Persist safe artifact/search chunks for page artifacts at run time so page + URLs do not disappear when `chat_stream_events` is swept. +4. Teach the prompt to use the tool for exact earlier wording, prior user + details, old artifact URLs, and post-compaction recall. + +This is a product trust feature. The user should be able to ask "what did I say +earlier?" after compaction, reload, or process restart, and the agent should +recover the answer without asking the user to manually search. + +## Current Persistence Model + +### Durable committed messages + +`src/chat/message-store.ts` stores committed rows in `chat_messages`: + +- `session_id` +- numeric message `seq` +- `role` +- `content_json` +- status, stop reason, model, cost, token counts, and error fields + +This is the highest-fidelity durable source for committed user and assistant +turns. `ChatSessionWriter` commits user rows before invoking the runtime and +assistant rows only after successful completion. Current writer tests verify +that non-success SDK results do not commit a fake successful assistant message. + +For user attachments, current code stores safe transcript metadata, not raw +payloads. `src/chat/message-builder.ts` builds SDK-native messages with base64 +or text payloads for the provider, but `ChatSessionWriter` commits only +attachment metadata plus the user text into `chat_messages.content_json`. +`src/chat/__tests__/writer.test.ts` verifies that raw base64 is not persisted in +the transcript row. 
+ +Limitations: + +- Assistant rows currently persist final response text, not the complete + interleaving of text, tool calls, and thinking. +- Thinking deltas must not be part of transcript lookup. +- Raw attachment payloads live on disk under chat attachment storage and should + never be read by transcript search. + +### Durable chat sessions and attachments + +`src/db/schema.ts` defines `chat_sessions`, `chat_messages`, +`chat_attachments`, `chat_stream_events`, and `chat_run_timelines`. + +`src/chat/session-store.ts` soft-deletes chat sessions and hard-deletes deleted +sessions after 30 days. Hard delete removes timelines, stream events, +attachments, messages, and the session row. Transcript search should respect +this lifecycle. + +`src/chat/attachment-store.ts` persists attachment metadata and storage paths. +Search should expose filename, MIME type, size, and message linkage only. It +should not dereference `storage_path`. + +### Stream event log + +`src/chat/event-log.ts` stores every SSE frame in `chat_stream_events` with: + +- `session_id` +- optional `message_id` +- stream `seq` +- `event_type` +- raw `payload_json` +- `created_at` + +This table has enough fidelity for recent replay and recent continuity: + +- streamed text deltas +- tool call starts +- tool call inputs +- tool result previews and truncated outputs +- compaction boundaries +- MCP status +- run status and errors + +`src/chat/continuity-context.ts` already uses this event log to recover page +artifacts from `phantom_create_page` and `phantom_preview_page`, while excluding +`phantom_generate_login` auth links. It also includes recent compaction +checkpoints. + +The problem is durability. `src/chat/sweep.ts` deletes stream events older than +24 hours. That is correct for replay storage, but it means the event log cannot +be the only source for historical transcript lookup. 
Event-derived facts that +matter after 24 hours need to be copied into a durable safe index or represented +in durable run timelines. + +### Durable run timeline + +`src/chat/run-timeline.ts` stores a compact safe run summary in +`chat_run_timelines`. It already redacts many credential shapes, drops raw +thinking, avoids raw command output, tracks tools and subagents, and persists +compaction metadata. + +This is useful for search, especially for tool names, safe input summaries, +safe output summaries, subagent summaries, errors, and compaction boundaries. +It is not enough for exact artifact recovery today because page tool outputs +are summarized as generic "Tool produced output." rather than persisted as +first-class artifacts. + +### Murph compaction and replay + +`/Users/truffle/work/murph/packages/core/src/query/query.ts` and +`/Users/truffle/work/murph/packages/core/src/query/session-replay.ts` show that +Murph compaction affects provider context and future replay, not Phantom's +SQLite chat store. Murph writes compact checkpoints and future replay starts +from the latest compact summary. That is correct for provider context health, +but it means Phantom must provide a separate exact lookup path for compacted-out +details. + +`/Users/truffle/work/murph/packages/core/src/substrate/pi-harness.ts` passes +`transformContext` through to Pi, and Phase 10G already uses that seam for +small host facts. Transcript search should use tools for exact lookup, not +try to stuff all prior detail into `transformContext`. + +### Qdrant memory + +`src/memory/system.ts`, `src/memory/consolidation.ts`, and +`src/memory/context-builder.ts` show that Qdrant memory is optional, dependent +on Qdrant and Ollama health, and stores episodes, facts, and procedures. The +current consolidation path creates heuristic episodes and facts from session +summaries. It is not the exact web chat transcript. + +Therefore Qdrant should remain a semantic memory layer. 
Transcript lookup +should use SQLite. + +## Proposed Transcript Search Contract + +### Agent tool + +Add an in-process MCP tool named `phantom_transcript_search`. + +Suggested description: + +```text +Search the durable Phantom chat transcript for exact earlier user messages, +assistant replies, safe tool/artifact summaries, and attachment metadata. Use +this when the user asks about something earlier in this chat, when compaction +may have removed details from provider context, or when you need the exact URL, +filename, sequence, wording, or prior answer. Results are redacted and safe to +quote, with session and sequence citations. +``` + +Suggested input schema: + +```ts +{ + query?: string; + scope?: "current_session" | "all_sessions"; + session_id?: string; + roles?: Array<"user" | "assistant" | "tool" | "system">; + kinds?: Array< + | "message" + | "attachment" + | "artifact" + | "tool_input" + | "tool_result" + | "compaction" + | "error" + >; + since?: string; + until?: string; + seq_from?: number; + seq_to?: number; + artifact_only?: boolean; + limit?: number; +} +``` + +Defaults: + +- `scope`: `current_session` +- `session_id`: injected by the tool server from the active chat session when + scope is `current_session` +- `roles`: user, assistant, tool +- `kinds`: message, attachment, artifact, tool_input, tool_result, compaction, + error +- `limit`: 10, max 25 + +For `all_sessions`, require at least one of `query`, `since`, `until`, or +`artifact_only`, and default to a recent bounded window such as 30 days when no +explicit date window is supplied. This prevents accidental whole-database scans. 
+ +Suggested output: + +```ts +{ + query: string | null; + scope: "current_session" | "all_sessions"; + results: Array<{ + session_id: string; + message_seq?: number; + stream_seq?: number; + run_id?: string; + role: "user" | "assistant" | "tool" | "system"; + kind: string; + created_at: string; + citation: string; + snippet: string; + metadata?: { + tool_name?: string; + attachment_id?: string; + filename?: string; + mime_type?: string; + size_bytes?: number; + url?: string; + path?: string; + }; + redacted: boolean; + }>; + warnings: string[]; +} +``` + +Citation format should be deterministic and compact: + +- `chat:<session_id>#msg:<message_seq>` for committed message rows +- `chat:<session_id>#stream:<stream_seq>` for safe event-derived entries +- `chat:<session_id>#run:<run_id>` for run timeline summaries + +### Search sources + +For the first slice, search these durable sources: + +1. `chat_messages` + - user text + - assistant final text + - attachment metadata blocks in `content_json` + +2. `chat_run_timelines` + - status + - compaction metadata + - safe tool input/output summaries + - safe subagent summaries + - safe error summaries + +3. A new durable safe artifact/search chunk table + - page URLs and paths from `phantom_create_page` + - preview metadata from `phantom_preview_page` + - future first-class artifacts + +`chat_stream_events` may be used as a recent fallback and as the source for +building artifact chunks, but it should not be the only durable search source +because it is swept after 24 hours. + +### SQLite approach + +SQLite is enough. Prefer a safe transcript index table in SQLite, optionally +with FTS5. I verified Bun's SQLite build supports FTS5 locally. + +The safest first implementation is: + +- Extract safe searchable chunks in TypeScript. +- Store only redacted, compact text in a new durable table. +- Query with SQLite FTS5 if the table is materialized, or with bounded LIKE + over extracted durable rows for the narrow first slice. 
+- Never index raw attachment bytes, raw thinking, raw secret tool outputs, or + magic login tokens. + +FTS5 is an implementation detail, not a product requirement. The product +contract is safe cited lookup. If FTS5 makes the implementation harder, bounded +SQLite scans over durable rows are acceptable for the first PR because the +default scope is a single chat session and result limits are small. + +### Tool versus continuity versus API + +Use all three surfaces over time, but not all in the first slice. + +First slice: + +- Agent tool: yes. This is the main product behavior. The agent should recover + details without asking the user to search. +- Injected continuity summary: keep it, but narrow. Use it for recent page + artifacts and compaction checkpoints only. +- User-visible API: not required. Do not force manual search into the product + path. + +Later slice: + +- Add an authenticated `/chat/sessions/:id/search` API and UI search panel if + the chat UI needs manual search. That should call the same service as the + tool. + +## Safety And Redaction Model + +Transcript lookup must be safe to paste directly into model context. + +Rules: + +1. Never read attachment storage files. + Search returns attachment metadata only: filename, MIME type, size, and + message citation. + +2. Never return thinking deltas. + Exclude `message.thinking_start`, `message.thinking_delta`, and + `message.thinking_end` from search chunks. + +3. Never return raw login links or magic tokens. + Exclude `phantom_generate_login` outputs and redact `/ui/login?...` URLs. + Keep the Phase 10G distinction between page URLs and auth URLs. + +4. Never return secret tool values. + Exclude `phantom_get_secret` values and `phantom_collect_secrets` payloads. + If a secret-related tool appears, return only that a secret workflow + happened, not field values or magic links. + +5. Redact credential-shaped text everywhere. 
+ Move the existing redaction logic from `src/chat/run-timeline.ts` into a + shared chat redaction helper and apply it to message snippets, tool input + summaries, tool output summaries, errors, URLs, paths, and metadata. + +6. Prefer summaries over raw tool output. + Use `chat_run_timelines.summary.tools[].safeOutputSummary` for tools. Do not + dereference `full_ref`. Do not expose `output` from raw stream events except + through a strict artifact parser and redactor. + +7. Bound result size. + Snippets should be short, for example 240 to 500 characters. Tool result + summaries should be even shorter. `limit` should be capped at 25. + +8. Include redaction signals. + Every result should carry `redacted: true` if the snippet or metadata passed + through redaction and changed. + +9. Respect deleted sessions. + Search must ignore sessions where `chat_sessions.deleted_at IS NOT NULL` and + must not return rows for hard-deleted sessions. + +10. Treat broad all-session search as privileged and bounded. + The in-process tool is only available to the agent running inside the + owner's Phantom process, but it should still require a query or date window + for all-session searches. + +## Test Plan + +### Unit tests for the search service + +Add `src/chat/__tests__/transcript-search.test.ts`. + +Cases: + +1. Searches current-session user and assistant `chat_messages` by exact canary + text and returns `chat:<session_id>#msg:<message_seq>`. +2. Supports `roles`, `seq_from`, `seq_to`, `since`, `until`, and `limit`. +3. Ignores deleted sessions. +4. Returns attachment metadata from user transcript content and does not read + or return raw attachment payloads. +5. Searches durable run timeline summaries for safe tool names, safe input + summaries, compaction checkpoints, and errors. +6. Redacts API keys, bearer tokens, cookies, private keys, magic tokens, and + auth query parameters from snippets and metadata. +7. Excludes thinking deltas entirely. +8. 
Excludes `phantom_generate_login` magic links while preserving safe + `/ui/` artifact URLs. +9. Keeps working after `eventLog.sweep(24)` by finding committed messages and + durable artifact chunks, not relying only on `chat_stream_events`. + +### Tool server tests + +Add `src/agent/__tests__/transcript-tools.test.ts` or extend a focused +in-process MCP test. + +Cases: + +1. `phantom_transcript_search` returns JSON with compact cited results. +2. Current-session scope injects the active chat session id and refuses to + search an unrelated session unless `scope: "all_sessions"` is explicit. +3. `limit` is capped. +4. Broad all-session search without query/date/artifact filter returns a clean + validation error. + +### Chat writer and artifact tests + +Extend `src/chat/__tests__/writer.test.ts` or add a focused index test. + +Cases: + +1. User message commit writes a durable searchable chunk. +2. Assistant successful commit writes a durable searchable chunk. +3. Non-success SDK result does not write an assistant success chunk. +4. `phantom_create_page` and `phantom_preview_page` events produce durable safe + artifact chunks. +5. `phantom_generate_login` events do not produce artifact chunks. + +### Agent boundary tests + +Extend `src/agent/__tests__/agent-sdk-boundary-callers.test.ts`. + +Cases: + +1. Chat query path includes the transcript tool server in `mcpServers`. +2. Stale resume retry recreates the transcript tool server factory, matching + the current factory pattern for in-process MCP servers. +3. Murph chat queries still receive continuity via `transformContext`, and the + transcript tool remains available as an MCP server. + +### Post-compaction recovery test + +Add a deterministic backend test that does not depend on model judgment: + +1. Seed a chat session with an early user message containing a unique canary. +2. Add enough later transcript rows and a `session.compact_boundary` event to + represent compaction. +3. 
Invoke `phantom_transcript_search` with the current session id and the canary + query. +4. Assert the result returns the canary with a `chat:<session_id>#msg:<message_seq>` + citation. + +Then add one live verification for the full behavior: + +1. Run Phantom locally with `PHANTOM_AGENT_RUNTIME=murph` and a small context + model or aggressive auto-compaction settings. +2. Put a unique early detail in chat, such as `PHASE10I_CANARY_<unique-suffix>`. +3. Force a long enough interaction to emit a compaction boundary. +4. Ask "what was the canary I gave you earlier?" +5. Verify the answer recovers the detail and the run timeline shows the + transcript search tool call after compaction. + +## Builder Slice Proposal + +### Scope + +One PR should implement exact, safe, current-session transcript lookup for +committed messages, attachment metadata, durable run summaries, and page +artifacts. All-session search can be included if bounded and using the same +service, but it should not expand into a full UI search product. + +### Likely files touched + +Production files: + +1. `src/db/schema.ts` + - Optional: add `chat_transcript_entries` and `chat_transcript_entries_fts`. + - If the builder chooses bounded on-demand search first, this file may not + need to change. + +2. `src/chat/transcript-search.ts` + - New shared service. + - Parses durable message rows, run timeline summaries, and safe artifact + entries. + - Applies filtering, matching, redaction, snippet building, and citation + formatting. + +3. `src/chat/transcript-redaction.ts` + - New shared redaction helper, extracted from current run timeline logic. + - Used by transcript search and, ideally, by `run-timeline.ts` after a + narrow refactor. + +4. `src/chat/run-timeline.ts` + - Import shared redaction helper. + - No behavioral change except shared implementation. + +5. `src/chat/continuity-context.ts` + - Reuse safe artifact extraction helpers or read durable artifact chunks so + page continuity does not depend only on 24-hour stream events. + +6. 
`src/agent/in-process-transcript-tools.ts` + - New in-process MCP server factory for `phantom_transcript_search`. + - Keeps transcript lookup separate from reflective memory, which currently + exposes `phantom_memory_search` and `phantom_list_sessions`. + +7. `src/index.ts` + - Register the new factory, for example `"phantom-transcript"`. + +8. `src/agent/prompt-assembler.ts` + - Add short guidance: use `phantom_transcript_search` for exact earlier chat + wording, prior user details, artifact URLs, filenames, and post-compaction + recall. + +9. `src/chat/writer.ts` + - If using a materialized durable index, write user, assistant, and artifact + chunks at commit time. + - If using on-demand search, no change needed except possibly wiring a + transcript search dependency into the tool server. + +Test files: + +1. `src/chat/__tests__/transcript-search.test.ts` +2. `src/agent/__tests__/transcript-tools.test.ts` +3. `src/chat/__tests__/writer.test.ts` +4. `src/chat/__tests__/continuity-context.test.ts` +5. `src/chat/__tests__/run-timeline.test.ts` +6. `src/agent/__tests__/prompt-assembler.test.ts` +7. `src/agent/__tests__/agent-sdk-boundary-callers.test.ts` + +### Acceptance criteria + +1. After compaction, an agent tool call can recover an exact earlier user detail + from the current chat session with a session and sequence citation. +2. Search works after browser reload and process restart because it reads + durable SQLite data. +3. Search does not depend on Qdrant, Ollama, provider memory, or raw provider + context. +4. Search results never include raw attachment payloads, thinking deltas, + secret values, magic login links, or large raw tool outputs. +5. Page artifacts from `phantom_create_page` and `phantom_preview_page` are + recoverable as page URLs, while `phantom_generate_login` remains excluded. +6. Broad searches are bounded by scope, date, role, sequence, kind, and limit. +7. 
Existing continuity context keeps working and can eventually be backed by + the same safe artifact extraction contract. + +### Anti-patterns to avoid + +1. Do not use Qdrant as the exact transcript source. +2. Do not inject large transcript search results into every prompt. +3. Do not expose a manual-only UI search path and call the feature done. +4. Do not read attachment files during transcript lookup. +5. Do not return raw `payload_json` from `chat_stream_events`. +6. Do not rely on `chat_stream_events` for history older than 24 hours. +7. Do not treat auth links as page artifacts. +8. Do not add a generic data dump tool. This is a narrow safe recall tool. diff --git a/src/agent/__tests__/reflective-transcript-tools.test.ts b/src/agent/__tests__/reflective-transcript-tools.test.ts new file mode 100644 index 00000000..cb7d5fad --- /dev/null +++ b/src/agent/__tests__/reflective-transcript-tools.test.ts @@ -0,0 +1,106 @@ +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { MIGRATIONS } from "../../db/schema.ts"; +import { createReflectiveToolServer } from "../in-process-reflective-tools.ts"; + +type ToolResult = { + content: Array<{ type: "text"; text: string }>; + isError?: true; +}; + +type RegisteredTool = { + handler: (input: unknown) => Promise | ToolResult; +}; + +let db: Database; + +function registeredTools(): Record { + const server = createReflectiveToolServer(null, db, { currentChatSessionId: "chat-1" }); + const instance = server.instance as unknown as { _registeredTools: Record }; + return instance._registeredTools; +} + +function insertMessage(input: { id: string; seq: number; role: "user" | "assistant"; content: unknown }): void { + db.run( + `INSERT INTO chat_messages (id, session_id, seq, role, content_json) + VALUES (?, 'chat-1', ?, ?, ?)`, + [input.id, input.seq, input.role, JSON.stringify(input.content)], + ); +} + +beforeEach(() => { + db = new Database(":memory:"); + for 
(const sql of MIGRATIONS) { + db.run(sql); + } + db.run("INSERT INTO chat_sessions (id) VALUES ('chat-1')"); +}); + +afterEach(() => { + db.close(); +}); + +describe("phantom-reflective transcript tools", () => { + test("registers transcript search alongside memory and session tools", () => { + const tools = registeredTools(); + + expect(tools.phantom_memory_search).toBeDefined(); + expect(tools.phantom_list_sessions).toBeDefined(); + expect(tools.phantom_chat_transcript_search).toBeDefined(); + }); + + test("searches durable chat transcript with safe redaction", async () => { + insertMessage({ id: "m1", seq: 1, role: "user", content: "Remember alpha marker." }); + insertMessage({ + id: "m2", + seq: 2, + role: "assistant", + content: "alpha result with http://127.0.0.1:3112/ui/login?magic=secret-token", + }); + + const tool = registeredTools().phantom_chat_transcript_search; + const result = await tool.handler({ session_id: "chat-1", query: "alpha", limit: 10 }); + const parsed = JSON.parse(result.content[0]?.text ?? "{}") as { + count: number; + results: Array<{ seq: number; citation: string; snippet: string }>; + }; + + expect(parsed.count).toBe(2); + expect(parsed.results.map((row) => row.seq)).toEqual([2, 1]); + expect(parsed.results[0]?.citation).toBe("chat:chat-1#msg:2"); + expect(parsed.results[0]?.snippet).toContain("/ui/login?magic=[REDACTED]"); + expect(parsed.results[0]?.snippet).not.toContain("secret-token"); + }); + + test("lists recent transcript entries when query is omitted", async () => { + insertMessage({ id: "m1", seq: 1, role: "user", content: "first" }); + insertMessage({ id: "m2", seq: 2, role: "assistant", content: "second" }); + insertMessage({ id: "m3", seq: 3, role: "user", content: "third" }); + + const tool = registeredTools().phantom_chat_transcript_search; + const result = await tool.handler({ session_id: "chat-1", limit: 2 }); + const parsed = JSON.parse(result.content[0]?.text ?? 
"{}") as { results: Array<{ seq: number }> }; + + expect(parsed.results.map((row) => row.seq)).toEqual([3, 2]); + }); + + test("rejects transcript search outside the bound chat session", async () => { + insertMessage({ id: "m1", seq: 1, role: "user", content: "first" }); + + const tool = registeredTools().phantom_chat_transcript_search; + const result = await tool.handler({ session_id: "chat-2", limit: 1 }); + + expect(result.isError).toBe(true); + expect(result.content[0]?.text).toContain("current Phantom chat session"); + }); + + test("rejects transcript search when no chat session is bound", async () => { + const server = createReflectiveToolServer(null, db); + const instance = server.instance as unknown as { _registeredTools: Record }; + const tool = instance._registeredTools.phantom_chat_transcript_search; + const result = await tool.handler({ session_id: "chat-1", limit: 1 }); + + expect(result.isError).toBe(true); + expect(result.content[0]?.text).toContain("bound Phantom web chat run"); + }); +}); diff --git a/src/agent/chat-query.ts b/src/agent/chat-query.ts index 4bea2b58..fdf0e4e7 100644 --- a/src/agent/chat-query.ts +++ b/src/agent/chat-query.ts @@ -1,13 +1,7 @@ // Extracted chat-specific query logic for the runForChat method. // Lives outside runtime.ts to keep that file under the 300-line budget. 
-import { - type AgentSdkQueryOptions, - type McpServerConfig, - type SDKMessage, - type SDKUserMessage, - query, -} from "./agent-sdk.ts"; +import { type AgentSdkQueryOptions, type SDKMessage, type SDKUserMessage, query } from "./agent-sdk.ts"; type MessageParam = SDKUserMessage["message"]; import { buildAgentRuntimeEnv, resolveAgentRuntimeModel } from "../config/providers.ts"; @@ -18,6 +12,7 @@ import type { RoleTemplate } from "../roles/types.ts"; import type { CostTracker } from "./cost-tracker.ts"; import { type AgentCost, type AgentResponse, emptyCost } from "./events.ts"; import { createDangerousCommandBlocker, createFileTracker } from "./hooks.ts"; +import type { AgentMcpServerFactory } from "./mcp-server-factory.ts"; import { extractTextFromMessageParam } from "./message-param-utils.ts"; import { extractCost, extractTextFromMessage } from "./message-utils.ts"; import { createMurphContextTransform } from "./murph-context.ts"; @@ -35,7 +30,7 @@ export type ChatQueryDeps = { evolvedConfig: EvolvedConfig | null; roleTemplate: RoleTemplate | null; onboardingPrompt: string | null; - mcpServerFactories: Record McpServerConfig | Promise> | null; + mcpServerFactories: Record | null; }; export async function executeChatQuery( @@ -98,9 +93,17 @@ export async function executeChatQuery( const runSdk = async (useResume: boolean): Promise => { const permissionOptions = permissionOptionsFromConfig(deps.config); + const mcpFactoryContext = { + sessionKey, + channelId, + conversationId, + ...(channelId === "web" && conversationId ? { chatSessionId: conversationId } : {}), + }; const mcpServers = deps.mcpServerFactories ? 
Object.fromEntries( - await Promise.all(Object.entries(deps.mcpServerFactories).map(async ([k, f]) => [k, await f()] as const)), + await Promise.all( + Object.entries(deps.mcpServerFactories).map(async ([k, f]) => [k, await f(mcpFactoryContext)] as const), + ), ) : undefined; const queryOptions: AgentSdkQueryOptions = { diff --git a/src/agent/in-process-reflective-tools.ts b/src/agent/in-process-reflective-tools.ts index 2057a95b..662cb237 100644 --- a/src/agent/in-process-reflective-tools.ts +++ b/src/agent/in-process-reflective-tools.ts @@ -6,17 +6,18 @@ // clients (phantom_memory_query, phantom_history). Those are served by // src/mcp/tools-universal.ts. The Agent SDK subprocess cannot see the external // MCP server without going through HTTP, so we expose a thin in-process server -// with two tools and register it via runtime.setMcpServerFactories() in +// with reflective tools and register it via runtime.setMcpServerFactories() in // src/index.ts. // -// Naming note: we call the tools phantom_memory_search and -// phantom_list_sessions (matching the SKILL.md allowed-tools field) even -// though the external server's equivalents are called phantom_memory_query -// and phantom_history. The builder brief and the skill catalog use the new -// names; the old external-facing names stay for backward compatibility. +// Naming note: we call the legacy memory/session aliases phantom_memory_search +// and phantom_list_sessions (matching the SKILL.md allowed-tools field) even +// though the external server's equivalents are called phantom_memory_query and +// phantom_history. The builder brief and the skill catalog use the new names; +// the old external-facing names stay for backward compatibility. 
import type { Database } from "bun:sqlite"; import { z } from "zod"; +import { searchChatTranscript } from "../chat/transcript-search.ts"; import type { MemorySystem } from "../memory/system.ts"; import type { RecallOptions } from "../memory/types.ts"; import { type McpSdkServerConfigWithInstance, createSdkMcpServer, tool } from "./agent-sdk.ts"; @@ -40,7 +41,15 @@ function daysAgo(n: number): Date { return d; } -export function createReflectiveToolServer(memory: MemorySystem | null, db: Database): McpSdkServerConfigWithInstance { +export type ReflectiveToolServerOptions = { + currentChatSessionId?: string; +}; + +export function createReflectiveToolServer( + memory: MemorySystem | null, + db: Database, + options: ReflectiveToolServerOptions = {}, +): McpSdkServerConfigWithInstance { const memorySearch = tool( "phantom_memory_search", `Search the agent's persistent memory for past sessions, topics, and facts. Supports semantic search and temporal filtering. @@ -164,8 +173,52 @@ Each row has session_key, channel_id, conversation_id, status, total_cost_usd, t }, ); + const chatTranscriptSearch = tool( + "phantom_chat_transcript_search", + `Search or list the durable transcript for a Phantom web chat session. Use this after Murph compaction when an older detail from the current chat is missing from provider context. + +- session_id: the current Phantom chat session id from the block. +- query: optional search text. If omitted, returns recent committed transcript entries. +- role: "user", "assistant", or "all". Default is "all". +- before_seq / after_seq: optional transcript sequence window. +- limit: max results. Default 10, max 50. + +Returns compact, cited snippets with seq, role, created_at, status, and attachment metadata. Output is redacted for credentials, magic-login tokens, and large base64 payloads. 
This is operational transcript recovery, not long-term user memory.`, + { + session_id: z.string().min(1).describe("Current Phantom chat session id"), + query: z.string().optional().describe("Optional search text; omit to list recent transcript entries"), + role: z.enum(["user", "assistant", "all"]).optional().default("all"), + before_seq: z.number().int().min(1).optional().describe("Only return messages before this transcript seq"), + after_seq: z.number().int().min(0).optional().describe("Only return messages after this transcript seq"), + limit: z.number().int().min(1).max(50).optional().default(10), + }, + async (input) => { + if (!options.currentChatSessionId) { + return err("Transcript search is only available inside a bound Phantom web chat run."); + } + if (input.session_id !== options.currentChatSessionId) { + return err("Transcript search is restricted to the current Phantom chat session."); + } + try { + return ok( + searchChatTranscript(db, { + sessionId: input.session_id, + query: input.query, + role: input.role, + beforeSeq: input.before_seq, + afterSeq: input.after_seq, + limit: input.limit, + }), + ); + } catch (caught: unknown) { + const msg = caught instanceof Error ? 
caught.message : String(caught); + return err(`Transcript search failed: ${msg}`); + } + }, + ); + return createSdkMcpServer({ name: "phantom-reflective", - tools: [memorySearch, listSessions], + tools: [memorySearch, listSessions, chatTranscriptSearch], }); } diff --git a/src/agent/mcp-server-factory.ts b/src/agent/mcp-server-factory.ts new file mode 100644 index 00000000..b7e228fc --- /dev/null +++ b/src/agent/mcp-server-factory.ts @@ -0,0 +1,12 @@ +import type { McpServerConfig } from "./agent-sdk.ts"; + +export type AgentMcpServerFactoryContext = { + sessionKey: string; + channelId: string; + conversationId: string; + chatSessionId?: string; +}; + +export type AgentMcpServerFactory = ( + context?: AgentMcpServerFactoryContext, +) => McpServerConfig | Promise; diff --git a/src/agent/runtime.ts b/src/agent/runtime.ts index 64917696..181bf6da 100644 --- a/src/agent/runtime.ts +++ b/src/agent/runtime.ts @@ -1,5 +1,5 @@ import type { Database } from "bun:sqlite"; -import { type McpServerConfig, type SDKMessage, type SDKUserMessage, query } from "./agent-sdk.ts"; +import { type SDKMessage, type SDKUserMessage, query } from "./agent-sdk.ts"; type MessageParam = SDKUserMessage["message"]; import { buildAgentRuntimeEnv, resolveAgentRuntimeModel } from "../config/providers.ts"; @@ -13,6 +13,7 @@ import { type AgentCost, type AgentResponse, emptyCost } from "./events.ts"; import { createDangerousCommandBlocker, createFileTracker } from "./hooks.ts"; import { emitPluginInitSnapshot } from "./init-plugin-snapshot.ts"; import { type JudgeQueryOptions, type JudgeQueryResult, runJudgeQuery } from "./judge-query.ts"; +import type { AgentMcpServerFactory } from "./mcp-server-factory.ts"; import { wrapMessageContent } from "./message-param-utils.ts"; import { extractCost, extractTextFromMessage } from "./message-utils.ts"; import { permissionOptionsFromConfig } from "./permission-options.ts"; @@ -38,7 +39,7 @@ export class AgentRuntime { private roleTemplate: RoleTemplate | null 
= null; private onboardingPrompt: string | null = null; private lastTrackedFiles: string[] = []; - private mcpServerFactories: Record McpServerConfig | Promise> | null = null; + private mcpServerFactories: Record | null = null; constructor(config: PhantomConfig, db: Database) { this.config = config; @@ -62,7 +63,7 @@ export class AgentRuntime { this.onboardingPrompt = prompt; } - setMcpServerFactories(factories: Record McpServerConfig | Promise>): void { + setMcpServerFactories(factories: Record): void { this.mcpServerFactories = factories; } @@ -199,6 +200,7 @@ export class AgentRuntime { const runSdkQuery = async (useResume: boolean): Promise => { const permissionOptions = permissionOptionsFromConfig(this.config); + const mcpFactoryContext = { sessionKey, channelId, conversationId }; const queryStream = query({ prompt: text, options: { @@ -218,7 +220,9 @@ export class AgentRuntime { ? { mcpServers: Object.fromEntries( await Promise.all( - Object.entries(this.mcpServerFactories).map(async ([k, f]) => [k, await f()] as const), + Object.entries(this.mcpServerFactories).map( + async ([k, f]) => [k, await f(mcpFactoryContext)] as const, + ), ), ), } diff --git a/src/chat/__tests__/continuity-context.test.ts b/src/chat/__tests__/continuity-context.test.ts index dbbe9850..f3e40c34 100644 --- a/src/chat/__tests__/continuity-context.test.ts +++ b/src/chat/__tests__/continuity-context.test.ts @@ -55,6 +55,8 @@ describe("buildChatContinuityContext", () => { const context = buildChatContinuityContext({ sessionId: session.id, eventLog }); + expect(context).toContain(`Current Phantom chat session id: ${session.id}`); + expect(context).toContain("phantom_chat_transcript_search"); expect(context).toContain("User-visible page artifacts"); expect(context).toContain("Muhammad Ahmed Cheema Profile"); expect(context).toContain("http://127.0.0.1:3112/ui/muhammad-ahmed-cheema.html"); @@ -106,4 +108,14 @@ describe("buildChatContinuityContext", () => { expect(context).toContain("stream 
seq 13"); expect(context).toContain("500,000"); }); + + test("renders transcript recovery instructions even without artifacts or compaction", () => { + const session = sessionStore.create(); + + const context = buildChatContinuityContext({ sessionId: session.id, eventLog }); + + expect(context).toContain(`Current Phantom chat session id: ${session.id}`); + expect(context).toContain("phantom_chat_transcript_search"); + expect(context).toContain("Authentication links"); + }); }); diff --git a/src/chat/__tests__/transcript-search.test.ts b/src/chat/__tests__/transcript-search.test.ts new file mode 100644 index 00000000..4fe4a593 --- /dev/null +++ b/src/chat/__tests__/transcript-search.test.ts @@ -0,0 +1,230 @@ +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { MIGRATIONS } from "../../db/schema.ts"; +import { ChatSessionStore } from "../session-store.ts"; +import { searchChatTranscript } from "../transcript-search.ts"; + +let db: Database; + +function insertMessage(input: { + id: string; + sessionId?: string; + seq: number; + role: "user" | "assistant"; + content: unknown; +}): void { + db.run( + `INSERT INTO chat_messages (id, session_id, seq, role, content_json, created_at) + VALUES (?, ?, ?, ?, ?, ?)`, + [ + input.id, + input.sessionId ?? "chat-1", + input.seq, + input.role, + JSON.stringify(input.content), + `2026-05-01T00:00:0${input.seq}.000Z`, + ], + ); +} + +beforeEach(() => { + db = new Database(":memory:"); + for (const sql of MIGRATIONS) { + db.run(sql); + } + db.run("INSERT INTO chat_sessions (id) VALUES ('chat-1')"); + db.run("INSERT INTO chat_sessions (id) VALUES ('chat-2')"); +}); + +afterEach(() => { + db.close(); +}); + +describe("searchChatTranscript", () => { + test("finds compact cited snippets in the current session", () => { + insertMessage({ id: "m1", seq: 1, role: "user", content: "Build a profile page for Chima." 
}); + insertMessage({ + id: "m2", + seq: 2, + role: "assistant", + content: "Created http://127.0.0.1:3112/ui/muhammad-ahmed-cheema.html", + }); + insertMessage({ + id: "m3", + sessionId: "chat-2", + seq: 1, + role: "assistant", + content: "Other session with Chima should not leak.", + }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", query: "cheema", limit: 5 }); + + expect(result.count).toBe(1); + expect(result.results[0]).toMatchObject({ + id: "m2", + session_id: "chat-1", + seq: 2, + role: "assistant", + citation: "chat:chat-1#msg:2", + }); + expect(result.results[0]?.snippet).toContain("muhammad-ahmed-cheema.html"); + }); + + test("lists recent entries when query is omitted", () => { + insertMessage({ id: "m1", seq: 1, role: "user", content: "one" }); + insertMessage({ id: "m2", seq: 2, role: "assistant", content: "two" }); + insertMessage({ id: "m3", seq: 3, role: "user", content: "three" }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", limit: 2 }); + + expect(result.query).toBeNull(); + expect(result.results.map((row) => row.seq)).toEqual([3, 2]); + }); + + test("supports role and seq filters", () => { + insertMessage({ id: "m1", seq: 1, role: "user", content: "alpha" }); + insertMessage({ id: "m2", seq: 2, role: "assistant", content: "alpha assistant" }); + insertMessage({ id: "m3", seq: 3, role: "user", content: "alpha later" }); + + const result = searchChatTranscript(db, { + sessionId: "chat-1", + query: "alpha", + role: "user", + beforeSeq: 3, + limit: 10, + }); + + expect(result.results.map((row) => row.seq)).toEqual([1]); + }); + + test("redacts credentials, magic login tokens, and large base64 payloads", () => { + insertMessage({ + id: "m1", + seq: 1, + role: "assistant", + content: `Open http://127.0.0.1:3112/ui/login?magic=secret-token and use api_key=abc123 with ${"A".repeat(220)}`, + }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", query: "open", limit: 1 }); + const snippet = 
result.results[0]?.snippet ?? ""; + + expect(snippet).toContain("/ui/login?magic=[REDACTED]"); + expect(snippet).toContain("api_key=[REDACTED]"); + expect(snippet).toContain("[REDACTED_BLOB]"); + expect(snippet).not.toContain("secret-token"); + expect(snippet).not.toContain("abc123"); + }); + + test("summarizes attachments without returning payload bytes", () => { + insertMessage({ + id: "m1", + seq: 1, + role: "user", + content: [ + { + type: "attachment", + id: "att-1", + filename: "brief.pdf", + mime_type: "application/pdf", + size_bytes: 1234, + preview_url: "/chat/attachments/att-1/preview", + }, + { type: "text", text: "Please inspect the attached brief." }, + ], + }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", query: "brief", limit: 1 }); + + expect(result.results[0]?.attachments).toEqual([ + { filename: "brief.pdf", mime_type: "application/pdf", size_bytes: 1234 }, + ]); + expect(result.results[0]?.snippet).toContain("[attachment: brief.pdf application/pdf]"); + }); + + test("redacts credential-shaped attachment metadata in snippets and metadata", () => { + insertMessage({ + id: "m1", + seq: 1, + role: "user", + content: [ + { + type: "attachment", + filename: "AKIAABCDEFGHIJKLMNOP.pdf", + mime_type: "application/x-api-key=raw-secret", + size_bytes: 1, + source: { data: "RAW_BASE64_PAYLOAD" }, + }, + { type: "text", text: "alpha attachment" }, + ], + }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", query: "alpha", limit: 1 }); + const entry = result.results[0]; + + expect(entry?.snippet).toContain("[REDACTED_AWS_KEY].pdf"); + expect(entry?.attachments).toEqual([ + { filename: "[REDACTED_AWS_KEY].pdf", mime_type: "application/x-api-key=[REDACTED]", size_bytes: 1 }, + ]); + expect(JSON.stringify(entry)).not.toContain("AKIAABCDEFGHIJKLMNOP"); + expect(JSON.stringify(entry)).not.toContain("raw-secret"); + expect(JSON.stringify(entry)).not.toContain("RAW_BASE64_PAYLOAD"); + }); + + test("does not return messages from 
deleted sessions", () => { + db.run("UPDATE chat_sessions SET status = 'deleted' WHERE id = 'chat-1'"); + insertMessage({ id: "m1", seq: 1, role: "user", content: "deleted session detail" }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", query: "deleted", limit: 10 }); + + expect(result.count).toBe(0); + }); + + test("does not return messages from soft-deleted sessions", () => { + new ChatSessionStore(db).softDelete("chat-1"); + insertMessage({ id: "m1", seq: 1, role: "user", content: "soft deleted session detail" }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", query: "soft", limit: 10 }); + + expect(result.count).toBe(0); + }); + + test("redacts private keys, AWS keys, and line-wrapped base64", () => { + const wrappedBase64 = `${"A".repeat(64)}\n${"B".repeat(64)}\n${"C".repeat(64)}`; + insertMessage({ + id: "m1", + seq: 1, + role: "assistant", + content: `alpha AKIAABCDEFGHIJKLMNOP -----BEGIN PRIVATE KEY-----\nraw-private-key\n-----END PRIVATE KEY-----\n${wrappedBase64}`, + }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", query: "alpha", limit: 1 }); + const snippet = result.results[0]?.snippet ?? ""; + + expect(snippet).toContain("[REDACTED_AWS_KEY]"); + expect(snippet).toContain("[REDACTED_PRIVATE_KEY]"); + expect(snippet).toContain("[REDACTED_BLOB]"); + expect(snippet).not.toContain("AKIAABCDEFGHIJKLMNOP"); + expect(snippet).not.toContain("raw-private-key"); + expect(snippet).not.toContain("AAAA"); + }); + + test("omits unknown structured payloads instead of stringifying them", () => { + insertMessage({ + id: "m1", + seq: 1, + role: "assistant", + content: { + type: "unknown", + source: { data: "RAW_BASE64_PAYLOAD" }, + api_key: "plain-secret", + }, + }); + + const result = searchChatTranscript(db, { sessionId: "chat-1", limit: 1 }); + const snippet = result.results[0]?.snippet ?? 
""; + + expect(snippet).toBe("[structured content omitted]"); + expect(snippet).not.toContain("RAW_BASE64_PAYLOAD"); + expect(snippet).not.toContain("plain-secret"); + }); +}); diff --git a/src/chat/continuity-context.ts b/src/chat/continuity-context.ts index 19e51949..84ecc75e 100644 --- a/src/chat/continuity-context.ts +++ b/src/chat/continuity-context.ts @@ -78,20 +78,24 @@ export function buildChatContinuityContext(input: BuildChatContinuityContextInpu const artifacts = dedupeArtifacts([...tools.values()].flatMap((tool) => artifactFromTool(tool) ?? [])); const latestCompactions = compactions.slice(-MAX_COMPACTIONS); - if (artifacts.length === 0 && latestCompactions.length === 0) { - return undefined; - } return renderContext({ + sessionId: input.sessionId, artifacts: artifacts.slice(-MAX_ARTIFACTS), compactions: latestCompactions, }); } -function renderContext(input: { artifacts: PageArtifact[]; compactions: CompactCheckpoint[] }): string { +function renderContext(input: { + sessionId: string; + artifacts: PageArtifact[]; + compactions: CompactCheckpoint[]; +}): string { const lines = [ "Durable Phantom chat context:", + `- Current Phantom chat session id: ${input.sessionId}.`, "- The transcript may have been compacted by Murph. 
Continue from the latest user message using these host facts when relevant.", + "- If an older detail is missing after compaction, call phantom_chat_transcript_search with the current chat session id before asking the user to repeat it.", "- Authentication links from phantom_generate_login are not page artifacts.", ]; diff --git a/src/chat/redaction.ts b/src/chat/redaction.ts new file mode 100644 index 00000000..01ca606e --- /dev/null +++ b/src/chat/redaction.ts @@ -0,0 +1,31 @@ +const PRIVATE_KEY_PATTERN = /-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?(?:-----END [A-Z ]*PRIVATE KEY-----|$)/g; +const AWS_ACCESS_KEY_PATTERN = /\b(?:AKIA|ASIA)[A-Z0-9]{16}\b/g; +const AUTH_QUERY_PATTERN = + /([?&](?:code|access_token|refresh_token|id_token|client_secret|token|secret|api_key|key|magic|password)=)[^&\s"'<>]+/gi; +const HEADER_SECRET_PATTERN = + /(\b(?:x[-_])?[a-z0-9_-]*(?:api[-_]?key|access[-_]?key|private[-_]?key|csrf[-_]?token|xsrf[-_]?token|csrf|xsrf|token|secret|password|auth|credential|session)[a-z0-9_-]*\s*:\s*)([^\s,;]+)/gi; +const ASSIGNMENT_SECRET_PATTERN = + /([a-z0-9_]*(?:api[_-]?key|access[_-]?key|private[_-]?key|token|secret|password|auth|credential|session|oauth|csrf|xsrf)[a-z0-9_]*\s*=\s*)([^\s&]+)/gi; +const UPPER_ASSIGNMENT_SECRET_PATTERN = + /(\b[A-Z][A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD|AUTH|CREDENTIAL|PRIVATE|SESSION|CODE|CSRF|XSRF)[A-Z0-9_]*\s*=\s*)([^\s&]+)/g; +const OPENAI_SECRET_PATTERN = /\b(sk-[a-z0-9_-]{12,})\b/gi; +const SINGLE_LINE_BLOB_PATTERN = /\b([a-z0-9+/]{80,}={0,2})\b/gi; +const LINE_WRAPPED_BLOB_PATTERN = /(?:\b[A-Za-z0-9+/]{40,}={0,2}\b[\r\n\t ]*){3,}/g; + +export function redactSensitiveText(value: string): string { + let output = value; + output = output.replace(PRIVATE_KEY_PATTERN, "[REDACTED_PRIVATE_KEY]"); + output = output.replace(AWS_ACCESS_KEY_PATTERN, "[REDACTED_AWS_KEY]"); + output = output.replace(AUTH_QUERY_PATTERN, "$1[REDACTED]"); + output = output.replace(/bearer\s+[a-z0-9._~+/=-]+/gi, "Bearer [REDACTED]"); + output = 
output.replace(/basic\s+[a-z0-9._~+/=-]+/gi, "Basic [REDACTED]"); + output = output.replace(/(authorization\s*[:=]\s*)([^\s,;]+)/gi, "$1[REDACTED]"); + output = output.replace(/(\bcookie\s*[:=]\s*)([^\n]+)/gi, "$1[REDACTED]"); + output = output.replace(HEADER_SECRET_PATTERN, "$1[REDACTED]"); + output = output.replace(ASSIGNMENT_SECRET_PATTERN, "$1[REDACTED]"); + output = output.replace(UPPER_ASSIGNMENT_SECRET_PATTERN, "$1[REDACTED]"); + output = output.replace(OPENAI_SECRET_PATTERN, "[REDACTED_SECRET]"); + output = output.replace(LINE_WRAPPED_BLOB_PATTERN, "[REDACTED_BLOB]"); + output = output.replace(SINGLE_LINE_BLOB_PATTERN, "[REDACTED_BLOB]"); + return output; +} diff --git a/src/chat/run-timeline.ts b/src/chat/run-timeline.ts index b5548181..2d601d70 100644 --- a/src/chat/run-timeline.ts +++ b/src/chat/run-timeline.ts @@ -1,4 +1,5 @@ import type { Database } from "bun:sqlite"; +import { redactSensitiveText } from "./redaction.ts"; import type { SessionErrorSubtype, StopReason } from "./types.ts"; import type { ChatWireFrame } from "./types.ts"; @@ -683,29 +684,5 @@ function isSensitiveKey(key: string): boolean { } function redact(value: string): string { - let output = value; - output = output.replace(/-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*/g, "[REDACTED_PRIVATE_KEY]"); - output = output.replace(/\b(?:AKIA|ASIA)[A-Z0-9]{16}\b/g, "[REDACTED_AWS_KEY]"); - output = output.replace( - /([?&](?:code|access_token|refresh_token|id_token|client_secret|token|secret|api_key|key)=)[^&\s]+/gi, - "$1[REDACTED]", - ); - output = output.replace(/bearer\s+[a-z0-9._~+/=-]+/gi, "Bearer [REDACTED]"); - output = output.replace(/(authorization\s*[:=]\s*)([^\s,;]+)/gi, "$1[REDACTED]"); - output = output.replace(/(\bcookie\s*[:=]\s*)([^\n]+)/gi, "$1[REDACTED]"); - output = output.replace( - /(\b(?:x[-_])?[a-z0-9_-]*(?:api[-_]?key|access[-_]?key|private[-_]?key|csrf[-_]?token|xsrf[-_]?token|csrf|xsrf|token|secret|password|auth|credential|session)[a-z0-9_-]*\s*:\s*)([^\s,;]+)/gi, - 
"$1[REDACTED]", - ); - output = output.replace( - /([a-z0-9_]*(?:api[_-]?key|access[_-]?key|private[_-]?key|token|secret|password|auth|credential|session|oauth|csrf|xsrf)[a-z0-9_]*\s*=\s*)([^\s&]+)/gi, - "$1[REDACTED]", - ); - output = output.replace( - /(\b[A-Z][A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD|AUTH|CREDENTIAL|PRIVATE|SESSION|CODE|CSRF|XSRF)[A-Z0-9_]*\s*=\s*)([^\s&]+)/g, - "$1[REDACTED]", - ); - output = output.replace(/\b(sk-[a-z0-9_-]{12,})\b/gi, "[REDACTED_SECRET]"); - output = output.replace(/\b([a-z0-9+/]{80,}={0,2})\b/gi, "[REDACTED_BLOB]"); - return output; + return redactSensitiveText(value); } diff --git a/src/chat/transcript-search.ts b/src/chat/transcript-search.ts new file mode 100644 index 00000000..be3de0f7 --- /dev/null +++ b/src/chat/transcript-search.ts @@ -0,0 +1,250 @@ +import type { Database } from "bun:sqlite"; +import { redactSensitiveText } from "./redaction.ts"; + +export type ChatTranscriptRole = "user" | "assistant" | "all"; + +export type ChatTranscriptSearchOptions = { + sessionId: string; + query?: string; + role?: ChatTranscriptRole; + afterSeq?: number; + beforeSeq?: number; + limit?: number; +}; + +export type ChatTranscriptEntry = { + id: string; + session_id: string; + seq: number; + role: "user" | "assistant"; + created_at: string; + status: string; + citation: string; + snippet: string; + attachments?: Array<{ + filename: string; + mime_type: string; + size_bytes: number | null; + }>; +}; + +export type ChatTranscriptSearchResult = { + session_id: string; + query: string | null; + role: ChatTranscriptRole; + limit: number; + count: number; + results: ChatTranscriptEntry[]; +}; + +type ChatMessageRow = { + id: string; + seq: number; + role: string; + content_json: string; + created_at: string; + status: string; +}; + +type ExtractedTranscriptContent = { + text: string; + attachments: ChatTranscriptEntry["attachments"]; +}; + +const DEFAULT_LIMIT = 10; +const MAX_LIMIT = 50; +const MAX_SCAN_ROWS = 2000; +const SNIPPET_RADIUS 
= 260; +const MAX_SNIPPET_LENGTH = 700; + +export function searchChatTranscript(db: Database, options: ChatTranscriptSearchOptions): ChatTranscriptSearchResult { + const limit = clampLimit(options.limit); + const role = options.role ?? "all"; + const query = normalizeWhitespace(options.query ?? ""); + const rows = loadCandidateRows(db, { ...options, role }, query ? MAX_SCAN_ROWS : limit); + const queryTokens = queryTokensFor(query); + const results: ChatTranscriptEntry[] = []; + + for (const row of rows) { + if (row.role !== "user" && row.role !== "assistant") continue; + const extracted = extractTranscriptContent(row.content_json); + const redactedText = redactSensitiveText(extracted.text); + if (queryTokens.length > 0 && !matchesQuery(redactedText, query, queryTokens)) { + continue; + } + const snippet = buildSnippet(redactedText, query, queryTokens); + results.push({ + id: row.id, + session_id: options.sessionId, + seq: row.seq, + role: row.role, + created_at: row.created_at, + status: row.status, + citation: `chat:${options.sessionId}#msg:${row.seq}`, + snippet, + ...(extracted.attachments && extracted.attachments.length > 0 ? 
{ attachments: extracted.attachments } : {}), + }); + if (results.length >= limit) break; + } + + return { + session_id: options.sessionId, + query: query || null, + role, + limit, + count: results.length, + results, + }; +} + +function loadCandidateRows( + db: Database, + options: ChatTranscriptSearchOptions & { role: ChatTranscriptRole }, + limit: number, +): ChatMessageRow[] { + const clauses = ["session_id = ?"]; + const params: Array = [options.sessionId]; + clauses.push( + "EXISTS (SELECT 1 FROM chat_sessions WHERE chat_sessions.id = chat_messages.session_id AND chat_sessions.deleted_at IS NULL AND chat_sessions.status != 'deleted')", + ); + + if (options.role !== "all") { + clauses.push("role = ?"); + params.push(options.role); + } else { + clauses.push("role IN ('user', 'assistant')"); + } + if (options.afterSeq !== undefined) { + clauses.push("seq > ?"); + params.push(options.afterSeq); + } + if (options.beforeSeq !== undefined) { + clauses.push("seq < ?"); + params.push(options.beforeSeq); + } + + params.push(limit); + return db + .query( + `SELECT id, seq, role, content_json, created_at, status + FROM chat_messages + WHERE ${clauses.join(" AND ")} + ORDER BY seq DESC + LIMIT ?`, + ) + .all(...params) as ChatMessageRow[]; +} + +function clampLimit(limit: number | undefined): number { + if (typeof limit !== "number" || !Number.isFinite(limit)) return DEFAULT_LIMIT; + return Math.max(1, Math.min(MAX_LIMIT, Math.floor(limit))); +} + +function queryTokensFor(query: string): string[] { + if (!query) return []; + return query + .toLowerCase() + .split(/\s+/) + .map((token) => token.trim()) + .filter((token) => token.length > 0) + .slice(0, 12); +} + +function matchesQuery(text: string, query: string, tokens: string[]): boolean { + const normalizedText = text.toLowerCase(); + if (query && normalizedText.includes(query.toLowerCase())) return true; + return tokens.every((token) => normalizedText.includes(token)); +} + +function buildSnippet(text: string, query: 
string, tokens: string[]): string { + const normalized = normalizeWhitespace(text); + if (!normalized) return ""; + + const lower = normalized.toLowerCase(); + const queryLower = query.toLowerCase(); + let index = queryLower ? lower.indexOf(queryLower) : -1; + if (index < 0) { + index = + tokens + .map((token) => lower.indexOf(token)) + .filter((candidate) => candidate >= 0) + .sort((a, b) => a - b)[0] ?? 0; + } + + const start = Math.max(0, index - SNIPPET_RADIUS); + const end = Math.min(normalized.length, Math.max(index + SNIPPET_RADIUS, start + MAX_SNIPPET_LENGTH)); + const snippet = `${start > 0 ? "... " : ""}${normalized.slice(start, end)}${end < normalized.length ? " ..." : ""}`; + return snippet.length > MAX_SNIPPET_LENGTH ? `${snippet.slice(0, MAX_SNIPPET_LENGTH - 4)} ...` : snippet; +} + +function extractTranscriptContent(contentJson: string): ExtractedTranscriptContent { + let parsed: unknown; + try { + parsed = JSON.parse(contentJson); + } catch { + return { text: contentJson, attachments: [] }; + } + return extractUnknownContent(parsed); +} + +function extractUnknownContent(value: unknown): ExtractedTranscriptContent { + if (typeof value === "string") { + return { text: value, attachments: [] }; + } + if (Array.isArray(value)) { + const texts: string[] = []; + const attachments: NonNullable = []; + for (const item of value) { + const extracted = extractUnknownContent(item); + if (extracted.text) texts.push(extracted.text); + if (extracted.attachments) attachments.push(...extracted.attachments); + } + return { text: texts.join("\n"), attachments }; + } + if (!isRecord(value)) { + return { text: value === null || value === undefined ? "" : String(value), attachments: [] }; + } + + if (value.type === "attachment") { + const attachment = attachmentSummary(value); + const text = attachment ? `[attachment: ${attachment.filename} ${attachment.mime_type}]` : "[attachment]"; + return { text, attachments: attachment ? 
[attachment] : [] }; + } + if (value.type === "text" && typeof value.text === "string") { + return { text: value.text, attachments: [] }; + } + if (typeof value.text === "string") { + return { text: value.text, attachments: [] }; + } + if (typeof value.content === "string") { + return { text: value.content, attachments: [] }; + } + return { text: "[structured content omitted]", attachments: [] }; +} + +function attachmentSummary( + record: Record, +): NonNullable[number] | null { + const filename = stringField(record, "filename") ?? "file"; + const mimeType = stringField(record, "mime_type") ?? stringField(record, "mimeType") ?? "application/octet-stream"; + const sizeValue = record.size_bytes ?? record.sizeBytes; + const sizeBytes = typeof sizeValue === "number" && Number.isFinite(sizeValue) ? sizeValue : null; + return { + filename: redactSensitiveText(filename), + mime_type: redactSensitiveText(mimeType), + size_bytes: sizeBytes, + }; +} + +function normalizeWhitespace(value: string): string { + return value.replace(/\s+/g, " ").trim(); +} + +function isRecord(value: unknown): value is Record { + return value !== null && typeof value === "object" && !Array.isArray(value); +} + +function stringField(record: Record, key: string): string | undefined { + const value = record[key]; + return typeof value === "string" && value.trim().length > 0 ? value : undefined; +} diff --git a/src/index.ts b/src/index.ts index bfd54b16..d8bfbad5 100644 --- a/src/index.ts +++ b/src/index.ts @@ -249,7 +249,10 @@ async function main(): Promise { runtime.setMcpServerFactories({ "phantom-dynamic-tools": () => createInProcessToolServer(registry), "phantom-scheduler": () => createSchedulerToolServer(scheduler as Scheduler), - "phantom-reflective": () => createReflectiveToolServer(memory.isReady() ? memory : null, db), + "phantom-reflective": (context) => + createReflectiveToolServer(memory.isReady() ? 
memory : null, db, { + currentChatSessionId: context?.chatSessionId, + }), "phantom-web-ui": () => createWebUiToolServer(config.public_url, config.name), "phantom-secrets": () => createSecretToolServer({ db, baseUrl: secretsBaseUrl }), "phantom-preview": () => createPreviewToolServer(config.port), From 7dbffbb989e97319e6021adb3776e3b5595f6032 Mon Sep 17 00:00:00 2001 From: Truffle Date: Fri, 1 May 2026 05:06:21 -0700 Subject: [PATCH 4/9] Polish tool card collapse behavior --- chat-ui/src/components/tool-call-card.tsx | 11 ++- .../src/lib/__tests__/tool-disclosure.test.ts | 56 ++++++++++++++ chat-ui/src/lib/tool-disclosure.ts | 68 +++++++++++++++++ .../phase-10j-tool-card-collapse-research.md | 70 +++++++++++++++++ .../phase-10j-tool-card-collapse-research.md | 76 +++++++++++++++++++ 5 files changed, 275 insertions(+), 6 deletions(-) create mode 100644 chat-ui/src/lib/__tests__/tool-disclosure.test.ts create mode 100644 chat-ui/src/lib/tool-disclosure.ts create mode 100644 prompts/phase-10j-tool-card-collapse-research.md create mode 100644 research/chat-experience/phase-10j-tool-card-collapse-research.md diff --git a/chat-ui/src/components/tool-call-card.tsx b/chat-ui/src/components/tool-call-card.tsx index e65f8322..2e6480ed 100644 --- a/chat-ui/src/components/tool-call-card.tsx +++ b/chat-ui/src/components/tool-call-card.tsx @@ -1,4 +1,5 @@ import type { ToolCallState } from "@/lib/chat-types"; +import { initialToolDisclosureState, reconcileToolDisclosureState, toggleToolDisclosure } from "@/lib/tool-disclosure"; import { cn } from "@/lib/utils"; import { AlertCircle, Check, ChevronDown, FileText, Loader2, Shield, Terminal, XCircle } from "lucide-react"; import { useEffect, useId, useState } from "react"; @@ -161,22 +162,20 @@ export function ToolCallCard({ tool }: { tool: ToolCallState }) { const inputDetails = toolInputDetails(tool); const output = tool.output ? 
redactSensitiveText(truncate(tool.output, TOOL_OUTPUT_DISPLAY_LIMIT)) : ""; - const autoExpand = tool.state === "error" || tool.state === "blocked"; - const [isOpen, setIsOpen] = useState(autoExpand); + const [disclosure, setDisclosure] = useState(() => initialToolDisclosureState(tool.state)); useEffect(() => { - if (tool.state === "error" || tool.state === "blocked") { - setIsOpen(true); - } + setDisclosure((current) => reconcileToolDisclosureState(current, tool.state)); }, [tool.state]); + const isOpen = disclosure.isOpen; const hasBody = Boolean(output || tool.error || tool.blockReason || inputDetails); return (
+
+
+ ); +} diff --git a/chat-ui/src/components/assistant-message.tsx b/chat-ui/src/components/assistant-message.tsx index 0b5be3bb..7d912139 100644 --- a/chat-ui/src/components/assistant-message.tsx +++ b/chat-ui/src/components/assistant-message.tsx @@ -1,62 +1,75 @@ -import type { ChatMessage, ThinkingBlockState, ToolCallState } from "@/lib/chat-types"; +import { extractToolArtifacts } from "@/lib/chat-artifacts"; import { getAssistantTextBlocks } from "@/lib/chat-message-content"; +import type { ChatMessage, ThinkingBlockState, ToolCallState } from "@/lib/chat-types"; +import { ArtifactTray } from "./artifact-tray"; import { Markdown } from "./markdown"; import { ThinkingBlock } from "./thinking-block"; import { ToolCallCard } from "./tool-call-card"; +export type ThinkingBlockItem = { + id: string; + block: ThinkingBlockState; +}; + export function AssistantMessage({ - message, - toolCalls, - thinkingBlocks, + message, + toolCalls, + thinkingBlocks, }: { - message: ChatMessage; - toolCalls: ToolCallState[]; - thinkingBlocks: ThinkingBlockState[]; + message: ChatMessage; + toolCalls: ToolCallState[]; + thinkingBlocks: ThinkingBlockItem[]; }) { - const textBlocks = getAssistantTextBlocks(message); - const hasText = textBlocks.length > 0; - - const isStreaming = message.status === "streaming"; - - return ( -
-
- {thinkingBlocks.map((block, i) => ( - - ))} - - {toolCalls.map((tool) => ( - - ))} - - {textBlocks.map((textContent, index) => ( - - ))} - - {isStreaming && !hasText && toolCalls.length === 0 && ( -
-
-
-
-
- )} - - {message.costUsd != null && message.status === "committed" && ( -
- {message.inputTokens != null && message.outputTokens != null && ( - - {message.inputTokens.toLocaleString()} in /{" "} - {message.outputTokens.toLocaleString()} out - - )} - {message.costUsd > 0 && ( - - ${message.costUsd.toFixed(4)} - - )} -
- )} -
-
- ); + const textBlocks = getAssistantTextBlocks(message); + const artifacts = extractToolArtifacts(toolCalls); + const hasText = textBlocks.length > 0; + + const isStreaming = message.status === "streaming"; + + return ( +
+
+ {thinkingBlocks.map((item) => ( + + ))} + + {toolCalls.map((tool) => ( + + ))} + + {textBlocks.map((textContent) => ( + + ))} + + + + {isStreaming && !hasText && toolCalls.length === 0 && ( +
+
+
+
+
+ )} + + {message.costUsd != null && message.status === "committed" && ( +
+ {message.inputTokens != null && message.outputTokens != null && ( + + {message.inputTokens.toLocaleString()} in / {message.outputTokens.toLocaleString()} out + + )} + {message.costUsd > 0 && ${message.costUsd.toFixed(4)}} +
+ )} +
+
+ ); +} + +function hashText(value: string): string { + let hash = 0; + for (let i = 0; i < value.length; i += 1) { + hash = (hash * 31 + value.charCodeAt(i)) | 0; + } + return Math.abs(hash).toString(36); } diff --git a/chat-ui/src/components/message-list.tsx b/chat-ui/src/components/message-list.tsx index b8f01006..f2a6ef2e 100644 --- a/chat-ui/src/components/message-list.tsx +++ b/chat-ui/src/components/message-list.tsx @@ -4,6 +4,7 @@ import type { ChatMessage, RunActivityState, ThinkingBlockState, ToolCallState } import { Button } from "@/ui/button"; import { ArrowDown } from "lucide-react"; import { useEffect, useMemo, useRef, useState } from "react"; +import type { ThinkingBlockItem } from "./assistant-message"; import { Message } from "./message"; import { MessageActions } from "./message-actions"; import { RunActivityRow } from "./run-activity-row"; @@ -36,10 +37,10 @@ export function MessageList({ }, [activeToolCalls]); const thinkingByMessage = useMemo(() => { - const map = new Map(); - for (const [, tb] of thinkingBlocks) { + const map = new Map(); + for (const [id, tb] of thinkingBlocks) { const existing = map.get(tb.messageId) ?? 
[]; - existing.push(tb); + existing.push({ id, block: tb }); map.set(tb.messageId, existing); } return map; diff --git a/chat-ui/src/components/message.tsx b/chat-ui/src/components/message.tsx index 88c6a91f..d26faf27 100644 --- a/chat-ui/src/components/message.tsx +++ b/chat-ui/src/components/message.tsx @@ -1,25 +1,19 @@ -import type { ChatMessage, ThinkingBlockState, ToolCallState } from "@/lib/chat-types"; -import { AssistantMessage } from "./assistant-message"; +import type { ChatMessage, ToolCallState } from "@/lib/chat-types"; +import { AssistantMessage, type ThinkingBlockItem } from "./assistant-message"; import { UserMessage } from "./user-message"; export function Message({ - message, - toolCalls, - thinkingBlocks, + message, + toolCalls, + thinkingBlocks, }: { - message: ChatMessage; - toolCalls: ToolCallState[]; - thinkingBlocks: ThinkingBlockState[]; + message: ChatMessage; + toolCalls: ToolCallState[]; + thinkingBlocks: ThinkingBlockItem[]; }) { - if (message.role === "user") { - return ; - } + if (message.role === "user") { + return ; + } - return ( - - ); + return ; } diff --git a/chat-ui/src/components/run-activity-row.tsx b/chat-ui/src/components/run-activity-row.tsx index 7c5f8b7b..d589c57f 100644 --- a/chat-ui/src/components/run-activity-row.tsx +++ b/chat-ui/src/components/run-activity-row.tsx @@ -1,8 +1,10 @@ +import { extractToolArtifacts } from "@/lib/chat-artifacts"; import type { RunActivityState, SubagentActivity, ToolCallState } from "@/lib/chat-types"; import { cn } from "@/lib/utils"; import { Activity, AlertCircle, CheckCircle2, Clock3, Loader2, Radio, ShieldAlert } from "lucide-react"; import type { LucideIcon } from "lucide-react"; import { useEffect, useMemo, useState } from "react"; +import { ArtifactTray } from "./artifact-tray"; import { ToolCallCard } from "./tool-call-card"; function statusIcon(activity: RunActivityState): { @@ -106,7 +108,9 @@ function plural(count: number, singular: string): string { function 
toolFacts(toolCalls: ToolCallState[]): string[] { const running = toolCalls.filter((tool) => tool.state === "running"); const completed = toolCalls.filter((tool) => tool.state === "result"); - const issues = toolCalls.filter((tool) => tool.state === "error" || tool.state === "blocked" || tool.state === "aborted"); + const issues = toolCalls.filter( + (tool) => tool.state === "error" || tool.state === "blocked" || tool.state === "aborted", + ); const facts: string[] = []; if (running.length > 0) { facts.push(`Using ${running.map((tool) => tool.toolName).join(", ")}`); @@ -169,6 +173,7 @@ export function RunActivityRow({ const elapsed = formatElapsed(elapsedAt - Date.parse(activity.startedAt)); const facts = useMemo(() => activityFacts(activity, toolCalls), [activity, toolCalls]); const subagents = useMemo(() => sortedSubagents(activity), [activity]); + const artifacts = useMemo(() => extractToolArtifacts(toolCalls), [toolCalls]); return (
@@ -237,6 +242,8 @@ export function RunActivityRow({ ))}
)} + +
diff --git a/chat-ui/src/lib/__tests__/chat-artifacts.test.ts b/chat-ui/src/lib/__tests__/chat-artifacts.test.ts new file mode 100644 index 00000000..8f693455 --- /dev/null +++ b/chat-ui/src/lib/__tests__/chat-artifacts.test.ts @@ -0,0 +1,105 @@ +import { describe, expect, it } from "vitest"; +import { extractToolArtifacts, formatArtifactSize } from "../chat-artifacts"; +import type { ToolCallState } from "../chat-types"; + +function tool(overrides: Partial): ToolCallState { + return { + id: "tool-1", + messageId: "message-1", + toolName: "mcp__phantom-web-ui__phantom_create_page", + state: "result", + inputJson: "{}", + isMcp: true, + ...overrides, + }; +} + +describe("extractToolArtifacts", () => { + it("extracts created page artifacts from structured tool output", () => { + const artifacts = extractToolArtifacts([ + tool({ + input: { title: "Sendoso vs Reachdesk", path: "sendoso-vs-reachdesk.html" }, + output: JSON.stringify({ + created: true, + path: "sendoso-vs-reachdesk.html", + url: "http://127.0.0.1:3112/ui/sendoso-vs-reachdesk.html", + size: 4221, + }), + }), + ]); + + expect(artifacts).toEqual([ + { + id: "page:http://127.0.0.1:3112/ui/sendoso-vs-reachdesk.html", + type: "page", + title: "Sendoso vs Reachdesk", + url: "http://127.0.0.1:3112/ui/sendoso-vs-reachdesk.html", + path: "sendoso-vs-reachdesk.html", + sizeBytes: 4221, + sourceToolName: "phantom_create_page", + }, + ]); + }); + + it("derives a relative page URL from preview input path", () => { + const artifacts = extractToolArtifacts([ + tool({ + toolName: "phantom_preview_page", + input: { path: "reports/weekly.html" }, + output: JSON.stringify({ status: 200, title: "Weekly Report" }), + }), + ]); + + expect(artifacts[0]?.url).toBe("/ui/reports/weekly.html"); + expect(artifacts[0]?.path).toBe("reports/weekly.html"); + expect(artifacts[0]?.title).toBe("Weekly Report"); + }); + + it("does not turn magic login links into artifacts", () => { + const artifacts = extractToolArtifacts([ + tool({ + 
toolName: "mcp__phantom-web-ui__phantom_generate_login", + output: JSON.stringify({ + magicLink: "http://127.0.0.1:3112/ui/login?magic=secret", + expiresIn: "10 minutes", + }), + }), + tool({ + output: JSON.stringify({ + url: "http://127.0.0.1:3112/ui/login?magic=secret", + path: "login", + }), + }), + ]); + + expect(artifacts).toEqual([]); + }); + + it("finds safe page URLs from text output and deduplicates them", () => { + const output = "Fetched http://example.com first. Created http://localhost:3100/ui/profile.html. You can share it."; + const artifacts = extractToolArtifacts([tool({ id: "tool-1", output }), tool({ id: "tool-2", output })]); + + expect(artifacts).toHaveLength(1); + expect(artifacts[0]?.url).toBe("http://localhost:3100/ui/profile.html"); + }); + + it("ignores unfinished tools", () => { + const artifacts = extractToolArtifacts([ + tool({ + state: "running", + output: JSON.stringify({ url: "http://localhost:3100/ui/profile.html" }), + }), + ]); + + expect(artifacts).toEqual([]); + }); +}); + +describe("formatArtifactSize", () => { + it("formats byte counts compactly", () => { + expect(formatArtifactSize(undefined)).toBeNull(); + expect(formatArtifactSize(512)).toBe("512 B"); + expect(formatArtifactSize(2048)).toBe("2.0 KB"); + expect(formatArtifactSize(1024 * 1024 * 2.2)).toBe("2.2 MB"); + }); +}); diff --git a/chat-ui/src/lib/chat-artifacts.ts b/chat-ui/src/lib/chat-artifacts.ts new file mode 100644 index 00000000..ab87bf6c --- /dev/null +++ b/chat-ui/src/lib/chat-artifacts.ts @@ -0,0 +1,139 @@ +import type { ToolCallState } from "./chat-types"; + +const PAGE_TOOL_NAMES = new Set(["phantom_create_page", "phantom_preview_page"]); +const MAX_TITLE_LENGTH = 90; + +export type ChatArtifactView = { + id: string; + type: "page"; + title: string; + url: string; + path?: string; + sizeBytes?: number; + sourceToolName: string; +}; + +export function extractToolArtifacts(toolCalls: ToolCallState[]): ChatArtifactView[] { + const artifacts = toolCalls + 
.map(pageArtifactFromTool) + .filter((artifact): artifact is ChatArtifactView => artifact !== null); + const byKey = new Map(); + for (const artifact of artifacts) { + const key = artifact.url || artifact.path || artifact.id; + byKey.set(key, artifact); + } + return [...byKey.values()]; +} + +export function formatArtifactSize(sizeBytes: number | undefined): string | null { + if (sizeBytes === undefined) return null; + if (sizeBytes < 1024) return `${sizeBytes} B`; + const kib = sizeBytes / 1024; + if (kib < 1024) return `${formatNumber(kib)} KB`; + return `${formatNumber(kib / 1024)} MB`; +} + +function pageArtifactFromTool(tool: ToolCallState): ChatArtifactView | null { + const sourceToolName = normalizePageToolName(tool.toolName); + if (!sourceToolName || tool.state !== "result") return null; + + const input = recordFromUnknown(tool.input) ?? parseJsonRecord(tool.inputJson); + const output = parseJsonRecord(tool.output); + const path = normalizePagePath(stringField(output, "path") ?? stringField(input, "path")); + const url = + normalizePageUrl( + stringField(output, "url") ?? + stringField(output, "publicUrl") ?? + stringField(output, "pageUrl") ?? + urlFromText(tool.output), + ) ?? urlFromPath(path); + if (!url) return null; + + const title = truncate( + stringField(input, "title") ?? stringField(output, "title") ?? path ?? "Created page", + MAX_TITLE_LENGTH, + ); + const sizeBytes = numberField(output, "size"); + return { + id: `page:${url}`, + type: "page", + title, + url, + sourceToolName, + ...(path ? { path } : {}), + ...(sizeBytes !== undefined ? 
{ sizeBytes } : {}), + }; +} + +function normalizePageToolName(toolName: string): string | null { + for (const pageToolName of PAGE_TOOL_NAMES) { + if (toolName === pageToolName || toolName.endsWith(`__${pageToolName}`) || toolName.endsWith(`:${pageToolName}`)) { + return pageToolName; + } + } + return null; +} + +function recordFromUnknown(value: unknown): Record | null { + if (value === null || typeof value !== "object" || Array.isArray(value)) return null; + return value as Record; +} + +function parseJsonRecord(value: string | undefined): Record | null { + if (!value) return null; + try { + return recordFromUnknown(JSON.parse(value)); + } catch { + return null; + } +} + +function stringField(record: Record | null, key: string): string | undefined { + const value = record?.[key]; + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function numberField(record: Record | null, key: string): number | undefined { + const value = record?.[key]; + return typeof value === "number" && Number.isFinite(value) ? value : undefined; +} + +function normalizePageUrl(value: string | undefined): string | undefined { + if (!value) return undefined; + const trimmed = stripTrailingPunctuation(value.trim()); + if (!trimmed.includes("/ui/")) return undefined; + if (trimmed.includes("/ui/login") || trimmed.includes("magic=")) return undefined; + return trimmed; +} + +function normalizePagePath(value: string | undefined): string | undefined { + if (!value) return undefined; + const cleaned = value.trim().replace(/^\/+/, "").replace(/^ui\//, ""); + if (!cleaned || cleaned.includes("..") || cleaned.includes("\0") || cleaned.startsWith("login")) return undefined; + return cleaned; +} + +function urlFromPath(path: string | undefined): string | undefined { + return path ? 
`/ui/${path}` : undefined; +} + +function urlFromText(value: string | undefined): string | undefined { + if (!value) return undefined; + const match = value.match(/(?:https?:\/\/[^\s"']*\/ui\/[^\s"']+|\/ui\/[^\s"']+)/); + return normalizePageUrl(match?.[0]); +} + +function stripTrailingPunctuation(value: string): string { + return value.replace(/[),.;]+$/g, ""); +} + +function truncate(value: string, maxLength: number): string { + if (value.length <= maxLength) return value; + return `${value.slice(0, maxLength - 3)}...`; +} + +function formatNumber(value: number): string { + return value < 10 ? value.toFixed(1) : Math.round(value).toString(); +} diff --git a/prompts/phase-10k-page-artifact-ui-builder.md b/prompts/phase-10k-page-artifact-ui-builder.md new file mode 100644 index 00000000..8f71c8b2 --- /dev/null +++ b/prompts/phase-10k-page-artifact-ui-builder.md @@ -0,0 +1,52 @@ +ultrathink. ultrathink. ultrathink. + +You are acting as principal engineer, principal architect, and principal product manager for Phantom's chat experience on Murph. + +Context: +- Branch: codex/chat-experience-polish-10j in /Users/truffle/work/phantom-murph-hardening. +- The open upstream PR #113 has been updated to current origin/main and is review-blocked only. +- Phase 10J has already implemented default-collapsed successful tool cards. +- The operator's current priority is best-in-class chat: durable run timeline, richer progress, file/artifact preview, markdown polish, and reliable long-running Murph conversations. +- We must not fake private chain-of-thought. We show only safe progress and existing runtime facts. +- We must not reinvent where Phantom already has primitives. + +Relevant current code: +- chat-ui/src/components/assistant-message.tsx renders thinking blocks, tool cards, markdown, and cost metadata. +- chat-ui/src/components/tool-call-card.tsx renders tool inputs and outputs. +- chat-ui/src/lib/chat-types.ts defines ToolCallState. 
+- src/chat/continuity-context.ts already extracts page artifacts from phantom_create_page and phantom_preview_page tool calls for post-compaction continuity. + +Task: +Implement the smallest production-grade artifact UI slice: +1. Add a client-side artifact extractor that derives safe, user-visible created page artifacts from existing ToolCallState data. +2. Reuse the same semantics as src/chat/continuity-context.ts where practical: + - Recognize phantom_create_page and phantom_preview_page. + - Prefer output url/publicUrl/pageUrl, then safe URL from text. + - Use input path/title when available. + - Include size when available. + - Exclude /ui/login and magic-login links. + - Deduplicate by URL or path. +3. Add a compact artifact card/tray in chat-ui that renders under the assistant answer when artifacts exist. +4. Keep this as a UI affordance. Do not add a new MCP tool or Phantom built-in for this slice. +5. Add focused tests for extraction policy. +6. Maintain strict TypeScript. No explicit any, no @ts-ignore, no em dashes, no emojis. +7. Keep files small and components focused. + +Acceptance criteria: +- Successful phantom_create_page results produce a visible artifact card with title, path/URL, size when known, Open, and Copy URL actions. +- phantom_generate_login output never becomes a created artifact. +- Relative and absolute /ui/ page URLs work; /ui/login does not. +- Duplicate tool frames do not duplicate artifact cards. +- Existing tool card tests and chat store tests remain green. +- Full local gates pass before commit. +- Live verification with Phantom on Murph/OpenAI creates a page and shows the artifact card. + +Anti-patterns: +- Do not parse arbitrary secrets into visible cards. +- Do not surface magic links as artifacts. +- Do not create a new server schema before proving the UI affordance. +- Do not bury the final answer under oversized artifact chrome. +- Do not make the card depend on hover-only controls. 
+ +Self-review: +After implementation, review the diff for overreach, accessibility, long URLs, mobile fit, and whether this should have reused an existing helper. diff --git a/research/chat-experience/phase-10k-page-artifact-ui.md b/research/chat-experience/phase-10k-page-artifact-ui.md new file mode 100644 index 00000000..1b54e08d --- /dev/null +++ b/research/chat-experience/phase-10k-page-artifact-ui.md @@ -0,0 +1,40 @@ +# Phase 10K Page Artifact UI + +Date: 2026-05-01 + +## Scope + +This slice adds a UI affordance for page artifacts that Phantom already creates +through `phantom_create_page` and `phantom_preview_page`. It does not add a new +MCP tool, built-in tool, or server wire format. + +## Why This Belongs In The UI First + +The durable continuity layer already extracts page artifacts in +`src/chat/continuity-context.ts` so the agent can recover them after compaction. +That means the runtime already has enough evidence to distinguish a created page +from an authentication link. The chat client can use the same facts to make the +created page visible to the user without asking the agent to repeat itself. + +## Policy + +- Recognize `phantom_create_page` and `phantom_preview_page`. +- Prefer JSON output fields `url`, `publicUrl`, and `pageUrl`. +- Use input `path` and `title` when present. +- Accept absolute or relative `/ui/` page URLs. +- Exclude `/ui/login` and links with magic login tokens. +- Deduplicate by URL, then path, then artifact id. +- Keep `phantom_generate_login` out of artifact extraction. + +## Product Shape + +Render a compact artifact tray under the assistant answer. Each card should show +the artifact type, title, path or URL, optional size, and two direct actions: +Open and Copy URL. The card is a browser affordance for already-known state, so +it must stay small and should not bury the final answer. 
+ +## Future Work + +General file previews need a richer server contract for safe path metadata, +preview kind, redaction status, and retention. This page-artifact slice is the +smallest useful step because created pages already return structured JSON. From 777f927b1c4109e75d8fc99e4aeed664082c4544 Mon Sep 17 00:00:00 2001 From: Truffle Date: Fri, 1 May 2026 06:08:06 -0700 Subject: [PATCH 6/9] Harden Murph chat continuity context --- .../phase-10l-murph-continuity-researcher.md | 73 ++++++++++++++ ...phase-10l-phantom-continuity-researcher.md | 70 +++++++++++++ .../phase-10l-continuity-hardening.md | 98 +++++++++++++++++++ .../agent-sdk-boundary-callers.test.ts | 43 ++++++++ src/agent/__tests__/murph-context.test.ts | 34 +++++++ src/agent/chat-query.ts | 16 ++- src/agent/murph-context.ts | 41 +++++--- src/agent/runtime.ts | 7 +- src/chat/__tests__/continuity-context.test.ts | 29 +++++- src/chat/__tests__/writer.test.ts | 60 +++++++++++- src/chat/continuity-context.ts | 36 +++++-- src/chat/writer.ts | 11 ++- 12 files changed, 486 insertions(+), 32 deletions(-) create mode 100644 prompts/phase-10l-murph-continuity-researcher.md create mode 100644 prompts/phase-10l-phantom-continuity-researcher.md create mode 100644 research/chat-experience/phase-10l-continuity-hardening.md diff --git a/prompts/phase-10l-murph-continuity-researcher.md b/prompts/phase-10l-murph-continuity-researcher.md new file mode 100644 index 00000000..64e98436 --- /dev/null +++ b/prompts/phase-10l-murph-continuity-researcher.md @@ -0,0 +1,73 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. You are researching Phantom on Murph +post-compaction continuity. Do not edit files. Do not summarize from memory. +Read the actual Murph and Phantom code. + +Context: + +- Phantom is being moved onto Murph, a clean-room pi-mono based agent runtime. 
+- We already implemented Phantom transcript recovery through + `phantom_chat_transcript_search`. +- Real browser testing showed long-running tool-heavy tasks work, and Murph + compaction appears to fire, but after compaction the agent may lose the + created-page answer or stop responding usefully. +- The operator believes compaction should be an LLM-written compact summary, + not literal history replay. The next model call should receive that summary + plus enough pointers and tools to continue. +- We must reuse pi-mono/Pi capabilities where they exist and avoid rebuilding + battle-tested primitives in worse form. + +Reading list: + +1. `/Users/truffle/work/murph/AGENTS.md` +2. `/Users/truffle/work/murph/QUALITY-BAR.md` +3. `/Users/truffle/work/murph/PROGRESS.md` +4. `/Users/truffle/work/murph/src/**` areas related to context overflow, + compaction, provider adapters, message history, tool call preservation, and + summaries. +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10g-pi-continuity.md` +6. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-recovery-implementation.md` +7. Any local pi-mono, pi-code, or notes repo already cloned under + `/Users/truffle/work` that contains relevant compaction or overflow logic. + +Questions to answer: + +1. What exactly does Murph do today when context approaches or exceeds the + provider limit? +2. Does Murph use pi-mono/Pi primitives for context overflow detection, + summarization, and retry, or did we duplicate that logic? +3. After compaction, what is sent to the next provider call? Include the shape + of summary, preserved tool-call state, and any continuation instructions. +4. Is there an existing Murph seam for "continue the interrupted turn after + compaction", or is compaction only a pre-call trimming step? +5. Which provider-specific constraints matter for OpenAI, Anthropic, and ZAI + around tool-call history after compaction? +6. 
What is the smallest production-grade fix that improves continuation + without corrupting tool-call protocols? +7. What tests should prove this, preferably with real providers when possible + and deterministic unit tests for the core state machine? + +Deliverables: + +- A concise architecture note with file references and line numbers. +- A clear verdict: reuse Pi/Murph existing primitives, fix Murph primitive, or + add Phantom-specific continuation glue. +- A ranked implementation plan with acceptance criteria. +- Explicit risks and anti-patterns, especially anything that would break + OpenAI tool-call message ordering. + +Anti-patterns: + +- Do not suggest raw transcript stuffing. +- Do not suggest regex-heavy summarization. +- Do not expose private chain-of-thought. +- Do not preserve raw secrets, magic links, auth headers, or full tool payloads + in summaries. +- Do not treat a single reference repo as authoritative. + +Self-review: + +Before finalizing, re-read your own findings and remove any claim not backed by +files you actually inspected. diff --git a/prompts/phase-10l-phantom-continuity-researcher.md b/prompts/phase-10l-phantom-continuity-researcher.md new file mode 100644 index 00000000..bb7bdb5b --- /dev/null +++ b/prompts/phase-10l-phantom-continuity-researcher.md @@ -0,0 +1,70 @@ +ultrathink. ultrathink. ultrathink. + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. You are researching Phantom's web chat +behavior around long-running Murph sessions, compaction, replay, and user +perception. Do not edit files. Read actual files and, if useful, inspect local +SQLite state from the recent test sessions. + +Context: + +- Phantom now runs locally on Murph with real OpenAI successfully. +- Tool cards, durable timeline, transcript search, and artifact cards exist. 
+- The operator observed a real long-running test where the agent created a + page, compaction seemed to work, then follow-up questions about the page link + got confused or stopped responding. +- The UI also needs to communicate long-running work better, but the immediate + risk is whether the agent run state and transcript recovery are correct after + compaction. + +Reading list: + +1. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +2. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +3. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10i-transcript-recovery-implementation.md` +4. `/Users/truffle/work/phantom-murph-hardening/src/chat/**` +5. `/Users/truffle/work/phantom-murph-hardening/src/agent/**` +6. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/**` areas related + to status, run activity, messages, and tool rendering. +7. Recent local SQLite chat/run event data if available and safe to inspect. + +Questions to answer: + +1. How does a new user message in web chat become an AgentRuntime query, and + how is the previous Murph session id reused? +2. Does Phantom send enough post-compaction context for the agent to recover + created artifacts, page paths, and prior task state? +3. Does Phantom have a concept of "agent compacted, now continue the same task", + or does every user message start a fresh query that depends on Murph memory? +4. Are durable timeline events sufficient to reconstruct what happened for the + user and for the agent, or are they UI-only? +5. Where can the UI truthfully show compaction, recovery, waiting, and retry + states with current wire events? +6. What caused the observed "done but confused about the created page link" + failure mode most likely: Murph summary quality, Phantom prompt/context, + transcript search tool availability, UI state, or agent behavior? +7. 
What is the smallest testable Phantom-side fix, if any, before changing + Murph internals? + +Deliverables: + +- A concise note with file references and line numbers. +- One recommended next implementation slice, including exact tests and browser + verification. +- A list of facts the agent should have after compaction and where each should + live: Murph summary, Phantom transcript search, durable artifact record, or + UI-only state. + +Anti-patterns: + +- Do not invent a separate hidden memory store if the transcript or artifact + record is the right source of truth. +- Do not make the UI pretend work is happening after the backend is done. +- Do not expose secrets or magic-login links in summaries, tools, artifacts, or + persisted timeline displays. +- Do not add a Phantom-specific hack if the right fix belongs in Murph. + +Self-review: + +Before finalizing, separate confirmed facts from hypotheses and note what would +prove or disprove each hypothesis. diff --git a/research/chat-experience/phase-10l-continuity-hardening.md b/research/chat-experience/phase-10l-continuity-hardening.md new file mode 100644 index 00000000..9ed0a14e --- /dev/null +++ b/research/chat-experience/phase-10l-continuity-hardening.md @@ -0,0 +1,98 @@ +# Phase 10L Continuity Hardening + +Date: 2026-05-01 + +## Goal + +Fix the observed post-compaction page-link confusion without inventing a +parallel Phantom memory runtime or weakening provider tool-call protocols. + +## Research Verdict + +Two read-only researchers inspected Phantom, Murph, Pi, and local event data. + +Confirmed: + +- Phantom web chat starts a fresh query per user message and resumes the same + Murph session through `sessions.sdk_session_id`. +- Murph uses Pi correctly for provider transport, context transforms, overflow + recognition, and provider-specific tool-message conversion. +- Murph compaction is an LLM summary plus a protected suffix, with one + compact-and-retry path after provider overflow. 
+- The remaining Murph-side gap is semantic summary quality for the active turn, + not provider protocol mechanics. +- Phantom had a concrete continuity bug: real stream events store MCP-qualified + page tool names such as `mcp__phantom-web-ui__phantom_create_page`, while the + host context extractor only recognized exact `phantom_create_page`. + +## Shipped Phantom Slice + +1. `buildChatContinuityContext(...)` now normalizes MCP-qualified page tool + names for `phantom_create_page` and `phantom_preview_page`. +2. Page path and URL extraction now rejects login and magic-link surfaces, + strips trailing punctuation, supports relative `/ui/...` URLs, and normalizes + paths with leading `/ui/`. +3. Phantom now passes a live `sessionContextProvider` through + `ChatSessionWriter -> AgentRuntime.runForChat -> executeChatQuery`. +4. Murph-only `transformContext` calls the provider each time Pi invokes the + transform, replacing stale Phantom context messages instead of + accumulating them. +5. Anthropic fallback still receives a run-start context string through the + existing system-prompt append path. + +## Why This Matters + +The run-start context cannot include page artifacts created later in the same +long run. Murph can compact and retry inside that same query. A live provider +lets a later transform see newer durable stream events without changing the +wire format or stuffing raw transcript data into the model. + +## Tests + +Focused tests cover: + +- MCP-qualified page creation and preview extraction. +- Magic-login exclusion and safe path normalization. +- Lazy context provider invocation on every transform call. +- Stale Phantom context replacement. +- Writer-level proof that a tool event emitted during a run appears in a later + provider call. +- Agent SDK boundary proof that Murph receives `transformContext` while + Anthropic fallback keeps prompt append behavior. 
+ +Commands run: + +```sh +bun test src/agent/__tests__/murph-context.test.ts src/chat/__tests__/continuity-context.test.ts src/chat/__tests__/writer.test.ts src/agent/__tests__/agent-sdk-boundary-callers.test.ts +bun run typecheck +bun run lint +bun test +git diff --check +``` + +Live OpenAI verification: + +- Ran Phantom locally on Murph/OpenAI `gpt-5.5` at `http://127.0.0.1:3100`. +- Created chat session `e974d07a-6dd4-4c5b-a032-ef3cbe7d4f23`. +- Asked the agent to create `continuity-link-proof-10l.html` with + `phantom_create_page`. +- Real stream events stored the page tool as + `mcp__phantom-web-ui__phantom_create_page` and preview as + `mcp__phantom-preview__phantom_preview_page`. +- `phantom_preview_page` returned HTTP 200 with title + `Continuity Link Proof 10L - Phantom`. +- A follow-up asking for the just-created page URL returned + `http://127.0.0.1:3100/ui/continuity-link-proof-10l.html`, not a login link. +- The generated test page was removed from the worktree after verification. + +## Remaining Work + +1. Murph should harden its generic compaction summary prompt with an explicit + current-turn continuation section: task intent, important tool outcomes, + created resources, paths, URLs, errors, and next action. +2. Phantom should promote created page and file artifacts into durable + first-class records. `chat_stream_events` are swept, so event-log context is + useful but not enough as the long-term artifact source. +3. The UI should surface retry/recovery state when Murph exposes such events. + Today it can show compaction and durable run activity, but not an explicit + compact-and-retry status. 
diff --git a/src/agent/__tests__/agent-sdk-boundary-callers.test.ts b/src/agent/__tests__/agent-sdk-boundary-callers.test.ts index de86cb13..7a31b60d 100644 --- a/src/agent/__tests__/agent-sdk-boundary-callers.test.ts +++ b/src/agent/__tests__/agent-sdk-boundary-callers.test.ts @@ -412,6 +412,49 @@ describe("Agent SDK boundary callers", () => { expect(textFromContent(contextMessage.content)).toContain("http://127.0.0.1:3112/ui/profile.html"); }); + test("chat query path passes a lazy Phantom continuity provider to Murph transformContext", async () => { + __setAgentSdkQueryForTests((params) => { + calls.push(params); + return queryFromMessages([initMessage(), assistantMessage("chat assistant"), resultMessage("chat result")]); + }); + + let context = "User-visible page: http://127.0.0.1:3112/ui/first.html"; + await executeChatQuery( + { + config: makeConfig({ + agent_runtime: "murph", + model: "gpt-5.5", + provider: { type: "openai" }, + }), + sessionStore: new SessionStore(db), + costTracker: new CostTracker(db), + memoryContextBuilder: null, + evolvedConfig: null, + roleTemplate: null, + onboardingPrompt: null, + mcpServerFactories: null, + }, + "web:chat-session", + { role: "user", content: "give me the page link" }, + Date.now(), + { + signal: new AbortController().signal, + sessionContextProvider: () => context, + onSdkEvent: () => {}, + }, + ); + + const options = calls[0]?.options as AgentSdkQueryOptions | undefined; + const transformContext = options?.transformContext; + expect(transformContext).toBeDefined(); + const first = (await transformContext?.([{ role: "user", content: "same prompt" }])) ?? []; + context = "User-visible page: http://127.0.0.1:3112/ui/second.html"; + const second = (await transformContext?.(first)) ?? 
[]; + + expect(JSON.stringify(second)).toContain("http://127.0.0.1:3112/ui/second.html"); + expect(JSON.stringify(second)).not.toContain("http://127.0.0.1:3112/ui/first.html"); + }); + test("chat query retries stale resume result frames without forwarding the error result", async () => { const sdkEvents: SDKMessage[] = []; let factoryCalls = 0; diff --git a/src/agent/__tests__/murph-context.test.ts b/src/agent/__tests__/murph-context.test.ts index 87c19c2a..ba3d7df8 100644 --- a/src/agent/__tests__/murph-context.test.ts +++ b/src/agent/__tests__/murph-context.test.ts @@ -34,6 +34,40 @@ describe("createMurphContextTransform", () => { expect(output).not.toContain(staleContext); }); + test("rebuilds context from a provider on each transform invocation", async () => { + let calls = 0; + const transform = createMurphContextTransform(() => { + calls += 1; + return `Fresh context ${calls}`; + }); + expect(transform).toBeDefined(); + + const first = await transform?.([{ role: "user", content: "first" }]); + const second = await transform?.(first ?? []); + + expect(calls).toBe(2); + expect(textContent(first?.[0])).toContain("Fresh context 1"); + const contexts = (second ?? []).filter((message) => textContent(message).includes("")); + expect(contexts).toHaveLength(1); + expect(textContent(contexts[0])).toContain("Fresh context 2"); + expect(textContent(second?.[0])).not.toContain("Fresh context 1"); + }); + + test("removes stale context when the provider returns empty context", async () => { + const staleContext = { + role: "user", + content: [{ type: "text", text: "\nStale context\n" }], + timestamp: 1, + }; + const transform = createMurphContextTransform(() => " "); + + const output = (await transform?.([staleContext, { role: "user", content: "next" }])) ?? 
[]; + + expect(output).toHaveLength(1); + expect(textContent(output[0])).toBe("next"); + expect(record(output[0])?.role).toBe("user"); + }); + test("returns undefined for empty context", () => { expect(createMurphContextTransform(" ")).toBeUndefined(); expect(createMurphContextTransform(undefined)).toBeUndefined(); diff --git a/src/agent/chat-query.ts b/src/agent/chat-query.ts index fdf0e4e7..4bba2a90 100644 --- a/src/agent/chat-query.ts +++ b/src/agent/chat-query.ts @@ -33,12 +33,19 @@ export type ChatQueryDeps = { mcpServerFactories: Record | null; }; +type SessionContextProvider = () => string | undefined; + export async function executeChatQuery( deps: ChatQueryDeps, sessionKey: string, message: MessageParam, startTime: number, - options: { signal: AbortSignal; onSdkEvent: (msg: SDKMessage) => void; sessionContext?: string }, + options: { + signal: AbortSignal; + onSdkEvent: (msg: SDKMessage) => void; + sessionContext?: string; + sessionContextProvider?: SessionContextProvider; + }, ): Promise { const parts = sessionKey.split(":"); const channelId = parts[0] ?? "web"; @@ -58,6 +65,7 @@ export async function executeChatQuery( } } const useMurphContextTransform = deps.config.agent_runtime === "murph"; + const initialSessionContext = options.sessionContextProvider?.() ?? options.sessionContext; const appendPrompt = assemblePrompt( deps.config, memoryContext, @@ -65,9 +73,11 @@ export async function executeChatQuery( deps.roleTemplate ?? undefined, deps.onboardingPrompt ?? undefined, undefined, - useMurphContextTransform ? undefined : options.sessionContext, + useMurphContextTransform ? undefined : initialSessionContext, ); - const transformContext = useMurphContextTransform ? createMurphContextTransform(options.sessionContext) : undefined; + const transformContext = useMurphContextTransform + ? createMurphContextTransform(options.sessionContextProvider ?? 
initialSessionContext) + : undefined; const queryModel = resolveAgentRuntimeModel(deps.config, deps.config.model); const providerEnv = buildAgentRuntimeEnv(deps.config, queryModel); diff --git a/src/agent/murph-context.ts b/src/agent/murph-context.ts index 231086eb..ec6da4d6 100644 --- a/src/agent/murph-context.ts +++ b/src/agent/murph-context.ts @@ -1,4 +1,5 @@ export type MurphContextTransform = (messages: unknown[], signal?: AbortSignal) => Promise | unknown[]; +export type MurphContextSource = string | undefined | (() => string | undefined | Promise); const PHANTOM_CONTEXT_OPEN_TAG = ""; const PHANTOM_CONTEXT_CLOSE_TAG = ""; @@ -9,25 +10,35 @@ type PhantomContextMessage = { timestamp: number; }; -export function createMurphContextTransform(context: string | undefined): MurphContextTransform | undefined { - const trimmed = context?.trim(); - if (!trimmed) return undefined; +export function createMurphContextTransform(context: MurphContextSource): MurphContextTransform | undefined { + if (typeof context !== "function") { + const trimmed = context?.trim(); + if (!trimmed) return undefined; + return (messages: unknown[]) => injectContext(messages, trimmed); + } - return (messages: unknown[]) => { + return async (messages: unknown[]) => { const cleaned = messages.filter((message) => !isPhantomContextMessage(message)); - const contextMessage = buildContextMessage(trimmed); - if (cleaned.length === 0) { - return [contextMessage]; - } + const trimmed = (await context())?.trim(); + if (!trimmed) return cleaned; + return injectContext(cleaned, trimmed); + }; +} - const lastIndex = cleaned.length - 1; - const lastMessage = cleaned[lastIndex]; - if (hasRole(lastMessage, "user")) { - return [...cleaned.slice(0, lastIndex), contextMessage, lastMessage]; - } +function injectContext(messages: unknown[], context: string): unknown[] { + const cleaned = messages.filter((message) => !isPhantomContextMessage(message)); + const contextMessage = buildContextMessage(context); + if 
(cleaned.length === 0) { + return [contextMessage]; + } - return [...cleaned, contextMessage]; - }; + const lastIndex = cleaned.length - 1; + const lastMessage = cleaned[lastIndex]; + if (hasRole(lastMessage, "user")) { + return [...cleaned.slice(0, lastIndex), contextMessage, lastMessage]; + } + + return [...cleaned, contextMessage]; } function buildContextMessage(content: string): PhantomContextMessage { diff --git a/src/agent/runtime.ts b/src/agent/runtime.ts index 181bf6da..bb72d130 100644 --- a/src/agent/runtime.ts +++ b/src/agent/runtime.ts @@ -127,7 +127,12 @@ export class AgentRuntime { async runForChat( sessionKey: string, message: MessageParam, - options: { signal: AbortSignal; onSdkEvent: (msg: SDKMessage) => void; sessionContext?: string }, + options: { + signal: AbortSignal; + onSdkEvent: (msg: SDKMessage) => void; + sessionContext?: string; + sessionContextProvider?: () => string | undefined; + }, ): Promise { if (this.activeSessions.has(sessionKey)) { return { text: "Error: session busy", sessionId: "", cost: emptyCost(), durationMs: 0 }; diff --git a/src/chat/__tests__/continuity-context.test.ts b/src/chat/__tests__/continuity-context.test.ts index f3e40c34..1ea095a9 100644 --- a/src/chat/__tests__/continuity-context.test.ts +++ b/src/chat/__tests__/continuity-context.test.ts @@ -28,7 +28,7 @@ describe("buildChatContinuityContext", () => { eventLog.append(session.id, null, 1, "message.tool_call_start", { event: "message.tool_call_start", tool_call_id: "tool-1", - tool_name: "phantom_create_page", + tool_name: "mcp__phantom-web-ui__phantom_create_page", message_id: "assistant-1", parent_tool_use_id: null, is_mcp: true, @@ -44,7 +44,7 @@ describe("buildChatContinuityContext", () => { eventLog.append(session.id, null, 3, "message.tool_call_result", { event: "message.tool_call_result", tool_call_id: "tool-1", - tool_name: "phantom_create_page", + tool_name: "mcp__phantom-web-ui__phantom_create_page", status: "success", output: JSON.stringify({ path: 
"muhammad-ahmed-cheema.html", @@ -61,9 +61,34 @@ describe("buildChatContinuityContext", () => { expect(context).toContain("Muhammad Ahmed Cheema Profile"); expect(context).toContain("http://127.0.0.1:3112/ui/muhammad-ahmed-cheema.html"); expect(context).toContain("muhammad-ahmed-cheema.html"); + expect(context).toContain("via phantom_create_page"); expect(context).not.toContain("/ui/login"); }); + test("extracts relative page links from MCP-qualified preview output text", () => { + const session = sessionStore.create(); + eventLog.append(session.id, null, 1, "message.tool_call_input_end", { + event: "message.tool_call_input_end", + tool_call_id: "tool-preview", + input: { + path: "reports/weekly.html", + }, + }); + eventLog.append(session.id, null, 2, "message.tool_call_result", { + event: "message.tool_call_result", + tool_call_id: "tool-preview", + tool_name: "mcp__phantom-preview__phantom_preview_page", + status: "success", + output: "Previewed /ui/reports/weekly.html.", + }); + + const context = buildChatContinuityContext({ sessionId: session.id, eventLog }); + + expect(context).toContain("reports/weekly.html"); + expect(context).toContain("/ui/reports/weekly.html"); + expect(context).toContain("via phantom_preview_page"); + }); + test("skips login links and keeps recent compact checkpoints", () => { const session = sessionStore.create(); eventLog.append(session.id, null, 1, "session.compact_boundary", { diff --git a/src/chat/__tests__/writer.test.ts b/src/chat/__tests__/writer.test.ts index f1fc5aba..1aa07894 100644 --- a/src/chat/__tests__/writer.test.ts +++ b/src/chat/__tests__/writer.test.ts @@ -40,7 +40,12 @@ function mockRuntime(overrides?: { runForChat?: ( key: string, msg: unknown, - opts: { signal: AbortSignal; onSdkEvent: (msg: unknown) => void; sessionContext?: string }, + opts: { + signal: AbortSignal; + onSdkEvent: (msg: unknown) => void; + sessionContext?: string; + sessionContextProvider?: () => string | undefined; + }, ) => Promise<{ text: 
string; sessionId: string; @@ -480,6 +485,59 @@ describe("ChatSessionWriter", () => { expect(capturedContext).toContain("http://127.0.0.1:3112/ui/profile.html"); }); + test("passes a live continuity context provider into the chat runtime", async () => { + const session = sessionStore.create(); + let providerContext: string | undefined; + + const writer = new ChatSessionWriter({ + sessionId: session.id, + runtime: mockRuntime({ + runForChat: async (_key, _message, opts) => { + opts.onSdkEvent({ + type: "tool_progress", + tool_use_id: "tool-late-page", + tool_name: "mcp__phantom-web-ui__phantom_create_page", + phase: "completed", + output_preview: JSON.stringify({ + title: "Late Page", + path: "late-page.html", + url: "http://127.0.0.1:3112/ui/late-page.html", + }), + duration_ms: 25, + }); + providerContext = opts.sessionContextProvider?.(); + opts.onSdkEvent({ + type: "result", + subtype: "success", + result: "ok", + stop_reason: "end_turn", + total_cost_usd: 0, + usage: {}, + modelUsage: {}, + duration_ms: 0, + num_turns: 1, + }); + return { + text: "ok", + sessionId: "sdk-1", + cost: { totalUsd: 0, inputTokens: 0, outputTokens: 0, modelUsage: {} }, + durationMs: 0, + }; + }, + }), + eventLog, + messageStore, + sessionStore, + streamBus, + }); + writer.claim(); + + await writer.run({ role: "user", content: "create a late page" }, "t1", "create a late page"); + + expect(providerContext).toContain("Late Page"); + expect(providerContext).toContain("http://127.0.0.1:3112/ui/late-page.html"); + }); + test("persists errored run timeline without committing assistant id", async () => { const session = sessionStore.create(); const writer = new ChatSessionWriter({ diff --git a/src/chat/continuity-context.ts b/src/chat/continuity-context.ts index 84ecc75e..4b0a6274 100644 --- a/src/chat/continuity-context.ts +++ b/src/chat/continuity-context.ts @@ -127,11 +127,12 @@ function renderContext(input: { } function artifactFromTool(tool: ToolAccumulator): PageArtifact | undefined { 
- if (!tool.toolName || !PAGE_TOOLS.has(tool.toolName)) return undefined; + const toolName = normalizePageToolName(tool.toolName); + if (!toolName) return undefined; const input = recordFromUnknown(tool.input); const output = parseJsonRecord(tool.output); - const path = stringField(output, "path") ?? stringField(input, "path"); + const path = normalizePagePath(stringField(output, "path") ?? stringField(input, "path")); const url = normalizePageUrl( stringField(output, "url") ?? stringField(output, "publicUrl") ?? @@ -144,7 +145,7 @@ function artifactFromTool(tool: ToolAccumulator): PageArtifact | undefined { const size = numberField(output, "size"); return { seq: tool.seq, - toolName: tool.toolName, + toolName, label: truncate(title, MAX_LABEL_LENGTH), ...(url ? { url } : {}), ...(path ? { path } : {}), @@ -152,6 +153,16 @@ function artifactFromTool(tool: ToolAccumulator): PageArtifact | undefined { }; } +function normalizePageToolName(toolName: string | undefined): string | undefined { + if (!toolName) return undefined; + for (const pageToolName of PAGE_TOOLS) { + if (toolName === pageToolName || toolName.endsWith(`__${pageToolName}`) || toolName.endsWith(`:${pageToolName}`)) { + return pageToolName; + } + } + return undefined; +} + function dedupeArtifacts(artifacts: PageArtifact[]): PageArtifact[] { const byKey = new Map(); for (const artifact of artifacts) { @@ -199,18 +210,31 @@ function numberField(record: Record | undefined, key: string): } function normalizePageUrl(url: string | undefined): string | undefined { - if (!url || !url.includes("/ui/") || url.includes("/ui/login")) { + const trimmed = stripTrailingPunctuation(url?.trim() ?? 
""); + if (!trimmed || !trimmed.includes("/ui/") || trimmed.includes("/ui/login") || trimmed.includes("magic=")) { + return undefined; + } + return trimmed; +} + +function normalizePagePath(path: string | undefined): string | undefined { + const cleaned = path?.trim().replace(/^\/+/, "").replace(/^ui\//, ""); + if (!cleaned || cleaned.includes("..") || cleaned.includes("\0") || cleaned.startsWith("login")) { return undefined; } - return url; + return cleaned; } function urlFromText(text: string | undefined): string | undefined { if (!text) return undefined; - const match = text.match(/https?:\/\/[^\s"']+\/ui\/[^\s"']+/); + const match = text.match(/https?:\/\/[^\s"']+\/ui\/[^\s"']+|\/ui\/[^\s"']+/); return normalizePageUrl(match?.[0]); } +function stripTrailingPunctuation(value: string): string { + return value.replace(/[),.;]+$/g, ""); +} + function truncate(value: string, maxLength: number): string { if (value.length <= maxLength) return value; return `${value.slice(0, maxLength - 3)}...`; diff --git a/src/chat/writer.ts b/src/chat/writer.ts index 806331ef..dc0a44e8 100644 --- a/src/chat/writer.ts +++ b/src/chat/writer.ts @@ -122,13 +122,16 @@ export class ChatSessionWriter { let terminalErrorMessage: string | null = null; try { - const sessionContext = buildChatContinuityContext({ - sessionId: this.deps.sessionId, - eventLog: this.deps.eventLog, - }); + const sessionContextProvider = () => + buildChatContinuityContext({ + sessionId: this.deps.sessionId, + eventLog: this.deps.eventLog, + }); + const sessionContext = sessionContextProvider(); const response = await this.deps.runtime.runForChat(sessionKey, message, { signal: this.abortController.signal, sessionContext, + sessionContextProvider, onSdkEvent: (sdkMsg: unknown) => { const frames = translateSdkMessage(sdkMsg as Record, ctx); for (const frame of frames) { From 8c6323625ed5a1c3cf46a2948020b0af7d813c9a Mon Sep 17 00:00:00 2001 From: Truffle Date: Fri, 1 May 2026 06:47:55 -0700 Subject: [PATCH 7/9] Add 
local chat UI build script --- .gitignore | 1 + package.json | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 1fca3d65..f36b8462 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ node_modules/ dist/ +public/chat/ data/ .env.* .env.local diff --git a/package.json b/package.json index 3a2ef267..97441cab 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "start": "bun run src/index.ts", "dev": "bun run --watch src/index.ts", "phantom": "bun run src/cli/main.ts", + "build:chat-ui": "cd chat-ui && bun run build && rm -rf ../public/chat && cp -R dist ../public/chat", "lint": "biome check src/", "lint:fix": "biome check --write src/", "typecheck": "tsc --noEmit", From aaccf8a69820260eb81b98235954b53afd3df638 Mon Sep 17 00:00:00 2001 From: Truffle Date: Fri, 1 May 2026 07:27:35 -0700 Subject: [PATCH 8/9] Run chat UI gates in CI --- .github/workflows/ci.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4858e553..8b799e3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,3 +24,18 @@ jobs: - name: Test run: bun test + + - name: Install Chat UI Dependencies + working-directory: chat-ui + run: bun install --frozen-lockfile + + - name: Chat UI Typecheck + working-directory: chat-ui + run: bun run typecheck + + - name: Chat UI Test + working-directory: chat-ui + run: bun test + + - name: Chat UI Build + run: bun run build:chat-ui From 83374b4e11fe17148044c72a744bc9d38d1cc769 Mon Sep 17 00:00:00 2001 From: Truffle Date: Fri, 1 May 2026 07:58:25 -0700 Subject: [PATCH 9/9] Preserve durable chat artifacts --- chat-ui/src/components/artifact-tray.tsx | 2 +- chat-ui/src/components/message-list.tsx | 6 +- chat-ui/src/components/run-activity-row.tsx | 11 +- chat-ui/src/components/tool-call-card.tsx | 2 +- chat-ui/src/hooks/__tests__/use-chat.test.ts | 97 +++++++++++ chat-ui/src/hooks/use-chat.ts | 11 +- 
.../src/lib/__tests__/chat-activity.test.ts | 45 ++++++ .../src/lib/__tests__/chat-artifacts.test.ts | 17 +- chat-ui/src/lib/chat-activity.ts | 2 +- chat-ui/src/lib/chat-artifacts.ts | 23 +-- chat-ui/src/lib/chat-types.ts | 12 ++ chat-ui/src/lib/timeline-view.ts | 17 ++ .../phase-10m-artifact-memory-architecture.md | 95 +++++++++++ prompts/phase-10m-chat-ui-polish-review.md | 86 ++++++++++ .../phase-10m-pi-murph-continuity-research.md | 93 +++++++++++ src/chat/__tests__/continuity-context.test.ts | 52 ++++++ src/chat/__tests__/run-timeline.test.ts | 85 ++++++++++ src/chat/continuity-context.ts | 37 ++++- src/chat/run-timeline.ts | 151 ++++++++++++++++++ src/chat/writer.ts | 1 + 20 files changed, 812 insertions(+), 33 deletions(-) create mode 100644 chat-ui/src/hooks/__tests__/use-chat.test.ts create mode 100644 chat-ui/src/lib/__tests__/chat-activity.test.ts create mode 100644 chat-ui/src/lib/timeline-view.ts create mode 100644 prompts/phase-10m-artifact-memory-architecture.md create mode 100644 prompts/phase-10m-chat-ui-polish-review.md create mode 100644 prompts/phase-10m-pi-murph-continuity-research.md diff --git a/chat-ui/src/components/artifact-tray.tsx b/chat-ui/src/components/artifact-tray.tsx index a0ba9f67..82af8b62 100644 --- a/chat-ui/src/components/artifact-tray.tsx +++ b/chat-ui/src/components/artifact-tray.tsx @@ -1,5 +1,5 @@ -import type { ChatArtifactView } from "@/lib/chat-artifacts"; import { formatArtifactSize } from "@/lib/chat-artifacts"; +import type { ChatArtifactView } from "@/lib/chat-types"; import { cn } from "@/lib/utils"; import { Copy, ExternalLink, FileText } from "lucide-react"; diff --git a/chat-ui/src/components/message-list.tsx b/chat-ui/src/components/message-list.tsx index f2a6ef2e..d1baf706 100644 --- a/chat-ui/src/components/message-list.tsx +++ b/chat-ui/src/components/message-list.tsx @@ -86,7 +86,11 @@ export function MessageList({ /> {message.role === "assistant" && } {message.runTimeline && ( - + )}
))} diff --git a/chat-ui/src/components/run-activity-row.tsx b/chat-ui/src/components/run-activity-row.tsx index d589c57f..63d355de 100644 --- a/chat-ui/src/components/run-activity-row.tsx +++ b/chat-ui/src/components/run-activity-row.tsx @@ -1,5 +1,5 @@ -import { extractToolArtifacts } from "@/lib/chat-artifacts"; -import type { RunActivityState, SubagentActivity, ToolCallState } from "@/lib/chat-types"; +import { extractToolArtifacts, mergeArtifactViews } from "@/lib/chat-artifacts"; +import type { ChatArtifactView, RunActivityState, SubagentActivity, ToolCallState } from "@/lib/chat-types"; import { cn } from "@/lib/utils"; import { Activity, AlertCircle, CheckCircle2, Clock3, Loader2, Radio, ShieldAlert } from "lucide-react"; import type { LucideIcon } from "lucide-react"; @@ -163,9 +163,11 @@ function subagentMeta(subagent: SubagentActivity): string { export function RunActivityRow({ activity, toolCalls, + artifacts: durableArtifacts = [], }: { activity: RunActivityState; toolCalls: ToolCallState[]; + artifacts?: ChatArtifactView[]; }) { const { Icon, className } = statusIcon(activity); const now = useLiveNow(activity.isActive); @@ -173,7 +175,10 @@ export function RunActivityRow({ const elapsed = formatElapsed(elapsedAt - Date.parse(activity.startedAt)); const facts = useMemo(() => activityFacts(activity, toolCalls), [activity, toolCalls]); const subagents = useMemo(() => sortedSubagents(activity), [activity]); - const artifacts = useMemo(() => extractToolArtifacts(toolCalls), [toolCalls]); + const artifacts = useMemo( + () => mergeArtifactViews(durableArtifacts, extractToolArtifacts(toolCalls)), + [durableArtifacts, toolCalls], + ); return (
diff --git a/chat-ui/src/components/tool-call-card.tsx b/chat-ui/src/components/tool-call-card.tsx index 2e6480ed..daab030a 100644 --- a/chat-ui/src/components/tool-call-card.tsx +++ b/chat-ui/src/components/tool-call-card.tsx @@ -169,7 +169,7 @@ export function ToolCallCard({ tool }: { tool: ToolCallState }) { }, [tool.state]); const isOpen = disclosure.isOpen; - const hasBody = Boolean(output || tool.error || tool.blockReason || inputDetails); + const hasBody = Boolean(output || tool.error || tool.blockReason || inputDetails || tool.fullRef); return (
diff --git a/chat-ui/src/hooks/__tests__/use-chat.test.ts b/chat-ui/src/hooks/__tests__/use-chat.test.ts new file mode 100644 index 00000000..29ad62c4 --- /dev/null +++ b/chat-ui/src/hooks/__tests__/use-chat.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from "vitest"; +import type { SessionDetail } from "../../lib/client"; +import { buildTimelineViewMap } from "../../lib/timeline-view"; + +function detail(overrides: Partial = {}): SessionDetail { + return { + id: "session-1", + title: "Session", + created_at: "2026-05-01T00:00:00.000Z", + updated_at: "2026-05-01T00:00:00.000Z", + last_message_at: "2026-05-01T00:00:00.000Z", + message_count: 1, + total_cost_usd: 0, + pinned: 0, + status: "active", + messages: [], + stream_state: { + max_seq: 4, + latest_terminal_seq: 1, + writer_active: false, + has_incomplete_tail: false, + }, + run_timelines: [ + { + id: "run-1", + session_id: "session-1", + user_message_id: "user-1", + assistant_message_id: null, + start_seq: 1, + end_seq: null, + status: "working", + started_at: "2026-05-01T00:00:00.000Z", + completed_at: null, + current_label: "Using Bash...", + stop_reason: null, + duration_ms: null, + cost_usd: null, + input_tokens: null, + output_tokens: null, + summary: { + schemaVersion: 1, + status: "working", + startSeq: 1, + endSeq: null, + startedAt: "2026-05-01T00:00:00.000Z", + currentLabel: "Using Bash...", + tools: [], + subagents: [], + errors: [], + }, + }, + ], + ...overrides, + }; +} + +describe("buildTimelineViewMap", () => { + it("skips an unassigned working timeline when resume will replay the active run", () => { + const map = buildTimelineViewMap( + detail({ + stream_state: { + max_seq: 4, + latest_terminal_seq: 1, + writer_active: true, + has_incomplete_tail: false, + }, + }), + ); + + expect(map.has("user-1")).toBe(false); + }); + + it("keeps completed user-attached timelines after reload", () => { + const base = detail(); + const timeline = base.run_timelines?.[0]; + if (!timeline) throw new 
Error("missing test timeline"); + const map = buildTimelineViewMap({ + ...base, + run_timelines: [ + { + ...timeline, + end_seq: 6, + status: "completed", + completed_at: "2026-05-01T00:00:03.000Z", + summary: { + ...timeline.summary, + status: "completed", + endSeq: 6, + completedAt: "2026-05-01T00:00:03.000Z", + }, + }, + ], + }); + + expect(map.get("user-1")?.activity.status).toBe("completed"); + }); +}); diff --git a/chat-ui/src/hooks/use-chat.ts b/chat-ui/src/hooks/use-chat.ts index 54d143f0..34a2a320 100644 --- a/chat-ui/src/hooks/use-chat.ts +++ b/chat-ui/src/hooks/use-chat.ts @@ -1,6 +1,6 @@ -import { runTimelineSummaryToView } from "@/lib/chat-activity"; import { parseMessageContentJson } from "@/lib/chat-message-content"; import { type ChatStore, beginRunActivity, createChatStore, dispatchFrame } from "@/lib/chat-store"; +import { buildTimelineViewMap } from "@/lib/timeline-view"; import type { ChatMessage, RunActivityState, @@ -256,15 +256,6 @@ export function useChat(sessionId: string | null): { }; } -function buildTimelineViewMap(detail: SessionDetail): Map { - const map = new Map(); - for (const timeline of detail.run_timelines ?? []) { - const key = timeline.assistant_message_id ?? 
timeline.user_message_id; - map.set(key, runTimelineSummaryToView(timeline.summary)); - } - return map; -} - function messageRowToChatMessage(row: SessionDetail["messages"][number], runTimeline?: RunTimelineView): ChatMessage { const parsed = parseMessageContentJson(row.content_json, row.role); diff --git a/chat-ui/src/lib/__tests__/chat-activity.test.ts b/chat-ui/src/lib/__tests__/chat-activity.test.ts new file mode 100644 index 00000000..611d9b0b --- /dev/null +++ b/chat-ui/src/lib/__tests__/chat-activity.test.ts @@ -0,0 +1,45 @@ +import { describe, expect, it } from "vitest"; +import { runTimelineSummaryToView } from "../chat-activity"; +import type { DurableRunTimelineSummary } from "../chat-types"; + +describe("runTimelineSummaryToView", () => { + it("preserves durable artifacts for reloaded timelines", () => { + const summary: DurableRunTimelineSummary = { + schemaVersion: 1, + status: "completed", + startSeq: 1, + endSeq: 6, + startedAt: "2026-05-01T00:00:00.000Z", + completedAt: "2026-05-01T00:00:03.000Z", + currentLabel: "Completed.", + artifacts: [ + { + id: "page:/ui/reports/weekly.html", + type: "page", + title: "Weekly Report", + url: "/ui/reports/weekly.html", + path: "reports/weekly.html", + sizeBytes: 8842, + sourceToolName: "phantom_create_page", + }, + ], + tools: [ + { + id: "tool-page", + name: "phantom_create_page", + state: "result", + isMcp: true, + mcpServer: "phantom-web-ui", + safeOutputSummary: "Tool produced output.", + }, + ], + subagents: [], + errors: [], + }; + + const view = runTimelineSummaryToView(summary); + + expect(view.artifacts).toEqual(summary.artifacts); + expect(view.toolCalls[0]?.output).toBe("Tool produced output."); + }); +}); diff --git a/chat-ui/src/lib/__tests__/chat-artifacts.test.ts b/chat-ui/src/lib/__tests__/chat-artifacts.test.ts index 8f693455..44ad27ed 100644 --- a/chat-ui/src/lib/__tests__/chat-artifacts.test.ts +++ b/chat-ui/src/lib/__tests__/chat-artifacts.test.ts @@ -1,5 +1,5 @@ import { describe, expect, 
it } from "vitest"; -import { extractToolArtifacts, formatArtifactSize } from "../chat-artifacts"; +import { extractToolArtifacts, formatArtifactSize, mergeArtifactViews } from "../chat-artifacts"; import type { ToolCallState } from "../chat-types"; function tool(overrides: Partial): ToolCallState { @@ -103,3 +103,18 @@ describe("formatArtifactSize", () => { expect(formatArtifactSize(1024 * 1024 * 2.2)).toBe("2.2 MB"); }); }); + +describe("mergeArtifactViews", () => { + it("deduplicates durable and live artifacts by URL", () => { + const durable = { + id: "page:/ui/profile.html", + type: "page" as const, + title: "Old", + url: "/ui/profile.html", + sourceToolName: "phantom_create_page", + }; + const live = { ...durable, title: "Latest" }; + + expect(mergeArtifactViews([durable], [live])).toEqual([live]); + }); +}); diff --git a/chat-ui/src/lib/chat-activity.ts b/chat-ui/src/lib/chat-activity.ts index 8d3a7ca7..91de3c27 100644 --- a/chat-ui/src/lib/chat-activity.ts +++ b/chat-ui/src/lib/chat-activity.ts @@ -126,7 +126,7 @@ export function runTimelineSummaryToView(summary: DurableRunTimelineSummary): Ru mcpServer: tool.mcpServer, blockReason: tool.blockReason, })); - return { activity, toolCalls }; + return { activity, toolCalls, artifacts: summary.artifacts }; } function runTimelineStatusToActivityStatus(status: DurableRunTimelineSummary["status"]): RunActivityStatus { diff --git a/chat-ui/src/lib/chat-artifacts.ts b/chat-ui/src/lib/chat-artifacts.ts index ab87bf6c..332db2a9 100644 --- a/chat-ui/src/lib/chat-artifacts.ts +++ b/chat-ui/src/lib/chat-artifacts.ts @@ -1,18 +1,8 @@ -import type { ToolCallState } from "./chat-types"; +import type { ChatArtifactView, ToolCallState } from "./chat-types"; const PAGE_TOOL_NAMES = new Set(["phantom_create_page", "phantom_preview_page"]); const MAX_TITLE_LENGTH = 90; -export type ChatArtifactView = { - id: string; - type: "page"; - title: string; - url: string; - path?: string; - sizeBytes?: number; - sourceToolName: string; 
-}; - export function extractToolArtifacts(toolCalls: ToolCallState[]): ChatArtifactView[] { const artifacts = toolCalls .map(pageArtifactFromTool) @@ -25,6 +15,17 @@ export function extractToolArtifacts(toolCalls: ToolCallState[]): ChatArtifactVi return [...byKey.values()]; } +export function mergeArtifactViews(...groups: ChatArtifactView[][]): ChatArtifactView[] { + const byKey = new Map(); + for (const group of groups) { + for (const artifact of group) { + const key = artifact.url || artifact.path || artifact.id; + byKey.set(key, artifact); + } + } + return [...byKey.values()]; +} + export function formatArtifactSize(sizeBytes: number | undefined): string | null { if (sizeBytes === undefined) return null; if (sizeBytes < 1024) return `${sizeBytes} B`; diff --git a/chat-ui/src/lib/chat-types.ts b/chat-ui/src/lib/chat-types.ts index 6e9b7488..3924deaf 100644 --- a/chat-ui/src/lib/chat-types.ts +++ b/chat-ui/src/lib/chat-types.ts @@ -59,6 +59,16 @@ export type ToolCallState = { blockReason?: string; }; +export type ChatArtifactView = { + id: string; + type: "page"; + title: string; + url: string; + path?: string; + sizeBytes?: number; + sourceToolName: string; +}; + export type ThinkingBlockState = { messageId: string; text: string; @@ -147,6 +157,7 @@ export type DurableRunTimelineSummary = { rateLimit?: RateLimitActivity; mcpServers?: Array<{ name: string; status: string }>; truncatedBacklog?: { olderThanSeq: number; reason: string }; + artifacts?: ChatArtifactView[]; tools: Array<{ id: string; name: string; @@ -171,6 +182,7 @@ export type DurableRunTimelineSummary = { export type RunTimelineView = { activity: RunActivityState; toolCalls: ToolCallState[]; + artifacts?: ChatArtifactView[]; }; export type ChatState = { diff --git a/chat-ui/src/lib/timeline-view.ts b/chat-ui/src/lib/timeline-view.ts new file mode 100644 index 00000000..23919f20 --- /dev/null +++ b/chat-ui/src/lib/timeline-view.ts @@ -0,0 +1,17 @@ +import { runTimelineSummaryToView } from 
"./chat-activity"; +import type { SessionDetail } from "./client"; +import type { RunTimelineView } from "./chat-types"; + +export function buildTimelineViewMap(detail: SessionDetail): Map { + const map = new Map(); + const resumeWillOwnActiveRun = + detail.stream_state?.writer_active === true || detail.stream_state?.has_incomplete_tail === true; + for (const timeline of detail.run_timelines ?? []) { + if (resumeWillOwnActiveRun && timeline.assistant_message_id === null && timeline.summary.status === "working") { + continue; + } + const key = timeline.assistant_message_id ?? timeline.user_message_id; + map.set(key, runTimelineSummaryToView(timeline.summary)); + } + return map; +} diff --git a/prompts/phase-10m-artifact-memory-architecture.md b/prompts/phase-10m-artifact-memory-architecture.md new file mode 100644 index 00000000..c4804dbf --- /dev/null +++ b/prompts/phase-10m-artifact-memory-architecture.md @@ -0,0 +1,95 @@ +ultrathink. ultrathink. ultrathink. + +# Phase 10M Architecture Prompt: Artifacts, User Memory, Agent Continuity + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. This is read-only architecture work. +Do not edit files. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No +"v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Context + +Cheema wants Phantom on Murph to handle long-running sessions like a serious +agent product. After compaction, the agent should continue smoothly. It should +not forget created pages, confuse magic-login links with artifacts, lose file +paths, or make the user repeat prior context. + +He also raised an architectural split: + +- User memory: durable user-visible facts and preferences. +- Agent continuity: operational facts for the current or recent runs, such as + created files, page URLs, task status, failures, and transcript-search + pointers. 
+- Historical transcript: searchable prior chat records for recovery after + compaction. + +Your job is to decide what should exist in Phantom versus Murph, what should +be MCP tools versus built-in server state, and what the next shippable slice is. + +## Required Reading + +Read: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/murph/AGENTS.md` +4. `/Users/truffle/work/murph/VISION.md` +5. `/Users/truffle/work/murph/ARCHITECTURE.md` +6. `/Users/truffle/work/murph/PROGRESS.md` +7. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +8. `/Users/truffle/work/phantom-murph-hardening/docs/memory.md` +9. `/Users/truffle/work/phantom-murph-hardening/src/memory/system.ts` +10. `/Users/truffle/work/phantom-murph-hardening/src/chat/continuity-context.ts` +11. `/Users/truffle/work/phantom-murph-hardening/src/chat/transcript-search.ts` +12. `/Users/truffle/work/phantom-murph-hardening/src/chat/run-timeline.ts` +13. `/Users/truffle/work/phantom-murph-hardening/src/agent/in-process-reflective-tools.ts` +14. `/Users/truffle/work/phantom-murph-hardening/src/ui/tools.ts` +15. `/Users/truffle/work/phantom-murph-hardening/src/db/schema.ts` +16. Relevant notes under `/Users/truffle/work/notes-main`. + +## Questions + +1. Should created pages/files become a durable `chat_artifacts` table, or can + run timelines plus stream events carry enough for now? +2. If we add artifacts, what exact schema is minimal but future-proof: + type, title, URL, path, MIME, size, preview kind, safety metadata, + retention, source tool, source message, source run? +3. Should artifacts be exposed to the agent through an MCP tool, injected + continuity context, or both? +4. How should user memory stay separate from operational run continuity? +5. How should historical transcript search be presented to the agent after + compaction without raw replay bloat? +6. 
What is the smallest next slice that improves real post-compaction behavior + and user-visible UI without prematurely building a full artifact system? + +## Acceptance Criteria + +- Cite file paths and line references. +- Make a clear recommendation. +- Include tradeoffs for MCP tool versus built-in server state versus UI-only. +- Identify migration and testing requirements if a schema change is proposed. +- Explicitly state what belongs in Murph and what belongs in Phantom. + +## Anti-Patterns + +- Do not put Phantom-specific artifact logic into Murph. +- Do not make TypeScript reason about user intent when an agent prompt/tool + can do it better. +- Do not store secrets, magic-login tokens, raw screenshots, or giant payloads + as continuity facts. +- Do not replace transcript search with raw transcript injection. + +## Final Output + +Return: + +1. Recommended architecture. +2. Minimal next slice. +3. Schema or no-schema decision. +4. Agent-facing tool/context decision. +5. Test plan. +6. Risks and non-goals. + +ultrathink. ultrathink. ultrathink. diff --git a/prompts/phase-10m-chat-ui-polish-review.md b/prompts/phase-10m-chat-ui-polish-review.md new file mode 100644 index 00000000..1f86d2fb --- /dev/null +++ b/prompts/phase-10m-chat-ui-polish-review.md @@ -0,0 +1,86 @@ +ultrathink. ultrathink. ultrathink. + +# Phase 10M Review Prompt: Phantom Chat UI Polish + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. You are reviewing Phantom's Murph chat +experience for production-grade quality. This is read-only. Do not edit files. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No +"v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Product Bar + +Cheema wants Phantom chat to feel best in class, comparable in craft to the +best coding-agent chat products. 
Every small detail matters: running state, +tool cards, collapsed defaults, thinking/progress, durable run timeline, +artifacts, markdown, file previews, mobile layout, reload recovery, and local +build ergonomics. + +We are not building a marketing page. The chat is the product. + +## Required Reading + +Read: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +4. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10d-chat-ui-polish-research.md` +5. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10b-durable-timeline-research.md` +6. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10h-product-direction.md` +7. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10k-page-artifact-ui.md` +8. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/tool-call-card.tsx` +9. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/run-activity-row.tsx` +10. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/message-list.tsx` +11. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/assistant-message.tsx` +12. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/components/chat-input.tsx` +13. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/chat-store.ts` +14. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/lib/tool-disclosure.ts` +15. `/Users/truffle/work/phantom-murph-hardening/chat-ui/src/index.css` + +Also inspect the latest PR branch diff against `origin/main`. + +## Review Questions + +1. What UI flaws remain after the current PR, especially ones visible in real + long-running Murph tasks? +2. Are completed tool cards collapsed by default while still making evidence + easy to inspect? +3. 
Does the run timeline explain activity during long tool calls, compaction, + reload/replay, and post-run completion? +4. Are artifact cards useful enough, or should the next slice add richer + previews, persistent artifact lists, or copy/open affordances? +5. Are there layout issues on desktop or mobile: borders, scroll behavior, + truncation, nested cards, empty states, input footer, or typography? +6. What automated or Playwright visual tests should be added before this UI is + considered shippable? + +## Acceptance Criteria + +- Findings must cite real files and ideally lines. +- Separate correctness bugs from product polish. +- Recommend a small next implementation slice that can be built and visually + verified in one PR. +- Do not ask for broad redesign for its own sake. The next slice must improve + real task usability. + +## Anti-Patterns + +- Do not focus only on colors. +- Do not propose hiding tool details for safety. The user owns the agent and + needs inspectability, with reasonable redaction for secrets. +- Do not introduce decorative UI that competes with the work. +- Do not recommend a huge rewrite unless you can name a specific blocker that + makes incremental work worse. + +## Final Output + +Return: + +1. Top five UI issues ranked by impact. +2. Evidence with file paths. +3. A proposed next slice with files, tests, and visual checks. +4. Things that should wait. + +ultrathink. ultrathink. ultrathink. diff --git a/prompts/phase-10m-pi-murph-continuity-research.md b/prompts/phase-10m-pi-murph-continuity-research.md new file mode 100644 index 00000000..8081f14f --- /dev/null +++ b/prompts/phase-10m-pi-murph-continuity-research.md @@ -0,0 +1,93 @@ +ultrathink. ultrathink. ultrathink. + +# Phase 10M Research Prompt: Pi/Murph Continuity Reuse + +You are a principal engineer, principal architect, and principal product +manager at Anthropic, all three at once. You are working for Cheema on Phantom +running on Murph. Your job is read-only research. 
Do not edit files. + +No context anxiety. No token limits. No time pressure. No cost anxiety. No +"v2" thinking. There is no v2. Build it right. Take as long as you need. + +## Goal + +Determine whether Phantom or Murph is reinventing continuity, compaction, +runtime progress, or transcript recovery behavior that Pi, Pi Code, pi-mono, +OpenClaw, opencode, Shelley, or local notes already handle better. + +Focus on the post-compaction continuation failure mode Cheema observed: +the agent completed a long task, compaction happened, then the user asked for a +created page link and the agent either confused the page with a login link or +stalled. Murph main now has a stronger compaction continuation prompt and +Phantom PR #113 has live context injection plus transcript search. Verify if +that architecture is still the right direction. + +## Required Reading + +Read real files, not summaries: + +1. `/Users/truffle/.claude/AGENTS.md` +2. `/Users/truffle/.claude/CLAUDE.md` +3. `/Users/truffle/work/murph/AGENTS.md` +4. `/Users/truffle/work/murph/QUALITY-BAR.md` +5. `/Users/truffle/work/murph/PROGRESS.md` +6. `/Users/truffle/work/phantom-murph-hardening/CLAUDE.md` +7. `/Users/truffle/work/phantom-murph-hardening/research/chat-experience/phase-10l-continuity-hardening.md` +8. `/Users/truffle/work/phantom-murph-hardening/src/agent/chat-query.ts` +9. `/Users/truffle/work/phantom-murph-hardening/src/agent/murph-context.ts` +10. `/Users/truffle/work/phantom-murph-hardening/src/chat/continuity-context.ts` +11. `/Users/truffle/work/phantom-murph-hardening/src/agent/in-process-reflective-tools.ts` +12. `/Users/truffle/work/murph/packages/core/src/compaction/summary-prompt.ts` +13. `/Users/truffle/work/murph/packages/core/src/query/query.ts` +14. `/Users/truffle/work/murph/packages/core/src/substrate/pi-adapter.ts` +15. `/Users/truffle/work/murph/packages/core/src/substrate/pi-summary.ts` +16. 
Relevant files in `/Users/truffle/work/pi-mono`, `/Users/truffle/work/research-clones/pi-mono-latest`, `/Users/truffle/work/openclaw`, `/Users/truffle/work/opencode`, `/Users/truffle/work/shelley`, and `/Users/truffle/work/notes-main`. + +Use `rg` aggressively. If a referenced repo lacks the expected files, say so +and continue with the repos that exist. + +## Questions To Answer + +1. Does Pi or pi-mono already expose a better primitive for context transforms, + compaction, memory pointers, or transcript recovery than the current Murph + implementation uses? +2. Is Phantom using Murph's Pi-backed `transformContext` in the least + invasive, provider-correct way? +3. Should user-facing memory and agent-facing continuity remain separate, or + should either be folded into the same transcript recovery tool? +4. Should Phantom create durable first-class artifact rows, or is stream-log + extraction enough for the next PR? +5. Are there protocol risks around tool-call replay, compact summaries, or + tool-result summarization that could break OpenAI, Anthropic, or ZAI after + compaction? +6. What is the next smallest testable slice that improves correctness without + adding a Phantom-only shortcut? + +## Acceptance Criteria + +- Ground every claim in file paths and line references. +- Distinguish "confirmed from code" from "inferred product recommendation." +- Identify P0/P1/P2 risks if any. +- Recommend one next implementation slice and one explicit non-goal. +- Do not propose copying implementation from another repo. Clean-room lessons + only. + +## Anti-Patterns + +- Do not say "Pi probably handles this" without a file path. +- Do not recommend a parallel Phantom memory runtime. +- Do not weaken provider tool-call protocol correctness. +- Do not suggest sending raw historical tool JSON back to providers. +- Do not optimize for a lucky demo. This is production chat continuity. + +## Final Output + +Return: + +1. Executive verdict, 3 to 6 bullets. +2. 
Evidence table with file paths and concise findings. +3. Risks ranked P0/P1/P2/P3. +4. Recommended next slice with exact files likely touched. +5. Non-goals and why. + +ultrathink. ultrathink. ultrathink. diff --git a/src/chat/__tests__/continuity-context.test.ts b/src/chat/__tests__/continuity-context.test.ts index 1ea095a9..63297226 100644 --- a/src/chat/__tests__/continuity-context.test.ts +++ b/src/chat/__tests__/continuity-context.test.ts @@ -3,11 +3,13 @@ import { afterEach, beforeEach, describe, expect, test } from "bun:test"; import { MIGRATIONS } from "../../db/schema.ts"; import { buildChatContinuityContext } from "../continuity-context.ts"; import { ChatEventLog } from "../event-log.ts"; +import { ChatRunTimelineStore } from "../run-timeline.ts"; import { ChatSessionStore } from "../session-store.ts"; let db: Database; let eventLog: ChatEventLog; let sessionStore: ChatSessionStore; +let timelineStore: ChatRunTimelineStore; beforeEach(() => { db = new Database(":memory:"); @@ -16,6 +18,7 @@ beforeEach(() => { } eventLog = new ChatEventLog(db); sessionStore = new ChatSessionStore(db); + timelineStore = new ChatRunTimelineStore(db); }); afterEach(() => { @@ -143,4 +146,53 @@ describe("buildChatContinuityContext", () => { expect(context).toContain("phantom_chat_transcript_search"); expect(context).toContain("Authentication links"); }); + + test("uses persisted timeline artifacts after stream events are unavailable", () => { + const session = sessionStore.create(); + db.run( + `INSERT INTO chat_messages (id, session_id, seq, role, content_json) + VALUES ('user-1', ?, 1, 'user', '"create page"')`, + [session.id], + ); + timelineStore.upsert({ + id: "run-1", + sessionId: session.id, + userMessageId: "user-1", + startSeq: 1, + status: "completed", + startedAt: "2026-05-01T00:00:00.000Z", + completedAt: "2026-05-01T00:00:03.000Z", + currentLabel: "Completed.", + summary: { + schemaVersion: 1, + status: "completed", + startSeq: 1, + endSeq: 4, + startedAt: 
"2026-05-01T00:00:00.000Z", + completedAt: "2026-05-01T00:00:03.000Z", + currentLabel: "Completed.", + artifacts: [ + { + id: "page:/ui/reports/weekly.html", + type: "page", + title: "Weekly Report", + url: "/ui/reports/weekly.html", + path: "reports/weekly.html", + sizeBytes: 8842, + sourceToolName: "phantom_create_page", + }, + ], + tools: [], + subagents: [], + errors: [], + }, + }); + + const context = buildChatContinuityContext({ sessionId: session.id, eventLog, timelineStore }); + + expect(context).toContain("Weekly Report"); + expect(context).toContain("/ui/reports/weekly.html"); + expect(context).toContain("reports/weekly.html"); + expect(context).toContain("via phantom_create_page"); + }); }); diff --git a/src/chat/__tests__/run-timeline.test.ts b/src/chat/__tests__/run-timeline.test.ts index ef2db581..794dc9aa 100644 --- a/src/chat/__tests__/run-timeline.test.ts +++ b/src/chat/__tests__/run-timeline.test.ts @@ -224,6 +224,91 @@ describe("DurableRunTimelineBuilder", () => { expect(summary.subagents[0]).toMatchObject({ taskId: "task-1", status: "completed", summary: "Done" }); }); + test("persists safe page artifacts without relying on full tool output", () => { + const builder = startBuilder(); + builder.apply( + { + event: "message.tool_call_start", + message_id: "assistant-1", + tool_call_id: "tool-page", + tool_name: "mcp__phantom-web-ui__phantom_create_page", + parent_tool_use_id: null, + is_mcp: true, + mcp_server: "phantom-web-ui", + }, + 2, + ); + builder.apply( + { + event: "message.tool_call_input_end", + tool_call_id: "tool-page", + input: { path: "reports/weekly.html", title: "Weekly Report" }, + }, + 3, + ); + builder.apply( + { + event: "message.tool_call_result", + tool_call_id: "tool-page", + tool_name: "mcp__phantom-web-ui__phantom_create_page", + status: "success", + duration_ms: 120, + output: JSON.stringify({ + created: true, + path: "reports/weekly.html", + url: "http://127.0.0.1:3112/ui/reports/weekly.html", + size: 8842, + }), + }, + 
4, + ); + + const summary = builder.toUpsertParams().summary; + expect(summary.tools[0]?.safeOutputSummary).toBe("Tool produced output."); + expect(summary.artifacts).toEqual([ + { + id: "page:http://127.0.0.1:3112/ui/reports/weekly.html", + type: "page", + title: "Weekly Report", + url: "http://127.0.0.1:3112/ui/reports/weekly.html", + path: "reports/weekly.html", + sizeBytes: 8842, + sourceToolName: "phantom_create_page", + }, + ]); + }); + + test("keeps magic login links out of durable artifacts", () => { + const builder = startBuilder(); + builder.apply( + { + event: "message.tool_call_start", + message_id: "assistant-1", + tool_call_id: "tool-login", + tool_name: "mcp__phantom-web-ui__phantom_generate_login", + parent_tool_use_id: null, + is_mcp: true, + mcp_server: "phantom-web-ui", + }, + 2, + ); + builder.apply( + { + event: "message.tool_call_result", + tool_call_id: "tool-login", + tool_name: "mcp__phantom-web-ui__phantom_generate_login", + status: "success", + output: JSON.stringify({ + magicLink: "http://127.0.0.1:3112/ui/login?magic=secret", + expiresIn: "10 minutes", + }), + }, + 3, + ); + + expect(builder.toUpsertParams().summary.artifacts).toEqual([]); + }); + test("records session done and session error terminal state", () => { const doneBuilder = startBuilder(); doneBuilder.apply( diff --git a/src/chat/continuity-context.ts b/src/chat/continuity-context.ts index 4b0a6274..854a0175 100644 --- a/src/chat/continuity-context.ts +++ b/src/chat/continuity-context.ts @@ -1,4 +1,5 @@ import type { ChatEventLog, ChatStreamEvent } from "./event-log.ts"; +import type { ChatRunTimelineStore, DurableRunTimelineArtifactSummary } from "./run-timeline.ts"; const DEFAULT_EVENT_SCAN_LIMIT = 5000; const MAX_ARTIFACTS = 8; @@ -9,6 +10,7 @@ const PAGE_TOOLS = new Set(["phantom_create_page", "phantom_preview_page"]); type BuildChatContinuityContextInput = { sessionId: string; eventLog: ChatEventLog; + timelineStore?: ChatRunTimelineStore; limit?: number; }; @@ -21,7 
+23,7 @@ type ToolAccumulator = { }; type PageArtifact = { - seq: number; + seq?: number; toolName: string; label: string; url?: string; @@ -76,7 +78,9 @@ export function buildChatContinuityContext(input: BuildChatContinuityContextInpu tools.set(toolCallId, tool); } - const artifacts = dedupeArtifacts([...tools.values()].flatMap((tool) => artifactFromTool(tool) ?? [])); + const timelineArtifacts = artifactsFromTimeline(input.timelineStore, input.sessionId); + const eventArtifacts = [...tools.values()].flatMap((tool) => artifactFromTool(tool) ?? []); + const artifacts = dedupeArtifacts([...timelineArtifacts, ...eventArtifacts]); const latestCompactions = compactions.slice(-MAX_COMPACTIONS); return renderContext({ @@ -118,7 +122,7 @@ function renderContext(input: { if (artifact.url) parts.push(` URL: ${artifact.url}`); if (artifact.path) parts.push(` path: ${artifact.path}`); if (artifact.size !== undefined) parts.push(` size: ${artifact.size} bytes`); - parts.push(` via ${artifact.toolName} at stream seq ${artifact.seq}.`); + parts.push(` via ${artifact.toolName}${artifact.seq === undefined ? "" : ` at stream seq ${artifact.seq}`}.`); lines.push(parts.join(";")); } } @@ -153,6 +157,31 @@ function artifactFromTool(tool: ToolAccumulator): PageArtifact | undefined { }; } +function artifactsFromTimeline(timelineStore: ChatRunTimelineStore | undefined, sessionId: string): PageArtifact[] { + if (!timelineStore) return []; + return timelineStore + .getDetailsBySession(sessionId) + .flatMap((timeline) => timeline.summary.artifacts ?? 
[]) + .map(timelineArtifactFromSummary) + .filter((artifact): artifact is PageArtifact => artifact !== undefined); +} + +function timelineArtifactFromSummary(artifact: DurableRunTimelineArtifactSummary): PageArtifact | undefined { + if (artifact.type !== "page") return undefined; + const toolName = normalizePageToolName(artifact.sourceToolName); + if (!toolName) return undefined; + const path = normalizePagePath(artifact.path); + const url = normalizePageUrl(artifact.url); + if (!url && !path) return undefined; + return { + toolName, + label: truncate(artifact.title, MAX_LABEL_LENGTH), + ...(url ? { url } : {}), + ...(path ? { path } : {}), + ...(artifact.sizeBytes !== undefined ? { size: artifact.sizeBytes } : {}), + }; +} + function normalizePageToolName(toolName: string | undefined): string | undefined { if (!toolName) return undefined; for (const pageToolName of PAGE_TOOLS) { @@ -169,7 +198,7 @@ function dedupeArtifacts(artifacts: PageArtifact[]): PageArtifact[] { const key = artifact.url ?? artifact.path ?? `${artifact.toolName}:${artifact.seq}`; byKey.set(key, artifact); } - return [...byKey.values()].sort((left, right) => left.seq - right.seq); + return [...byKey.values()].sort((left, right) => (left.seq ?? 0) - (right.seq ?? 
0)); } function parsePayload(event: ChatStreamEvent): Record | undefined { diff --git a/src/chat/run-timeline.ts b/src/chat/run-timeline.ts index 2d601d70..657e59e3 100644 --- a/src/chat/run-timeline.ts +++ b/src/chat/run-timeline.ts @@ -39,6 +39,16 @@ export type DurableRunTimelineErrorSummary = { message: string; }; +export type DurableRunTimelineArtifactSummary = { + id: string; + type: "page"; + title: string; + url: string; + path?: string; + sizeBytes?: number; + sourceToolName: string; +}; + export type DurableRunTimelineSummary = { schemaVersion: 1; status: RunTimelineStatus; @@ -61,6 +71,7 @@ export type DurableRunTimelineSummary = { }; mcpServers?: Array<{ name: string; status: string }>; truncatedBacklog?: { olderThanSeq: number; reason: string }; + artifacts?: DurableRunTimelineArtifactSummary[]; tools: DurableRunTimelineToolSummary[]; subagents: DurableRunTimelineSubagentSummary[]; errors: DurableRunTimelineErrorSummary[]; @@ -129,6 +140,15 @@ const MAX_SUMMARY_TEXT = 240; const MAX_OUTPUT_SUMMARY_TEXT = 360; const MAX_COLLECTION_ITEMS = 25; const MAX_INPUT_PARTS = 3; +const PAGE_TOOL_NAMES = ["phantom_create_page", "phantom_preview_page"] as const; +const MAX_ARTIFACT_TITLE = 90; + +type PageToolName = (typeof PAGE_TOOL_NAMES)[number]; + +type PageArtifactInput = { + path?: string; + title?: string; +}; function isTerminalToolState(state: RunTimelineToolState): boolean { return state === "result" || state === "error" || state === "blocked" || state === "aborted"; @@ -214,6 +234,7 @@ export class DurableRunTimelineBuilder { private readonly sessionId: string; private readonly userMessageId: string; private assistantMessageId: string | null = null; + private readonly artifactInputs = new Map(); private summary: DurableRunTimelineSummary; private constructor(params: RunTimelineStartParams) { @@ -227,6 +248,7 @@ export class DurableRunTimelineBuilder { endSeq: null, startedAt: params.startedAt, currentLabel: "Working...", + artifacts: [], tools: [], 
subagents: [], errors: [], @@ -272,6 +294,7 @@ export class DurableRunTimelineBuilder { tool.state = "running"; } tool.safeInputSummary = summarizeToolInput(frame.input); + this.captureArtifactInput(tool, frame.tool_call_id, frame.input); this.summary.currentLabel = `Prepared ${tool.name}.`; return true; } @@ -304,6 +327,7 @@ export class DurableRunTimelineBuilder { safeText(frame.output_preview, MAX_OUTPUT_SUMMARY_TEXT) ?? summarizeToolOutput(frame.status, frame.output); tool.outputTruncated = frame.output_truncated === true || isTruncated(frame.output_preview ?? frame.output, MAX_OUTPUT_SUMMARY_TEXT); + this.captureArtifactResult(tool, frame.tool_call_id, frame.output ?? frame.output_preview); this.summary.currentLabel = frame.status === "error" ? `${tool.name} failed.` : `${tool.name} completed.`; return true; } @@ -453,6 +477,7 @@ export class DurableRunTimelineBuilder { tools: this.summary.tools.map((tool) => ({ ...tool })), subagents: this.summary.subagents.map((subagent) => ({ ...subagent })), errors: this.summary.errors.map((error) => ({ ...error })), + artifacts: this.summary.artifacts?.map((artifact) => ({ ...artifact })), mcpServers: this.summary.mcpServers?.map((server) => ({ ...server })), }; } @@ -511,6 +536,57 @@ export class DurableRunTimelineBuilder { } return existing; } + + private captureArtifactInput(tool: DurableRunTimelineToolSummary, toolCallId: string, input: unknown): void { + if (!normalizePageToolName(tool.name)) return; + const record = recordFromUnknown(input); + if (!record) return; + const path = normalizePagePath(stringField(record, "path")); + const title = stringField(record, "title"); + if (path || title) { + this.artifactInputs.set(toolCallId, { + ...(path ? { path } : {}), + ...(title ? 
{ title: truncate(title, MAX_ARTIFACT_TITLE) } : {}), + }); + } + } + + private captureArtifactResult( + tool: DurableRunTimelineToolSummary, + toolCallId: string, + output: string | undefined, + ): void { + const sourceToolName = normalizePageToolName(tool.name); + if (!sourceToolName || tool.state !== "result") return; + const outputRecord = parseJsonRecord(output); + const input = this.artifactInputs.get(toolCallId); + const path = normalizePagePath(stringField(outputRecord, "path") ?? input?.path); + const url = + normalizePageUrl( + stringField(outputRecord, "url") ?? + stringField(outputRecord, "publicUrl") ?? + stringField(outputRecord, "pageUrl") ?? + urlFromText(output), + ) ?? urlFromPath(path); + if (!url) return; + const title = truncate( + stringField(outputRecord, "title") ?? input?.title ?? path ?? "Created page", + MAX_ARTIFACT_TITLE, + ); + const sizeBytes = numberField(outputRecord, "size"); + const artifact: DurableRunTimelineArtifactSummary = { + id: `page:${url}`, + type: "page", + title, + url, + sourceToolName, + ...(path ? { path } : {}), + ...(sizeBytes !== undefined ? { sizeBytes } : {}), + }; + const byId = new Map((this.summary.artifacts ?? []).map((current) => [current.id, current])); + byId.set(artifact.id, artifact); + this.summary.artifacts = [...byId.values()].slice(-MAX_COLLECTION_ITEMS); + } } export function runTimelineRowToDetail(row: ChatRunTimelineRow): ChatRunTimelineDetail { @@ -554,6 +630,7 @@ function parseRunTimelineSummary(summaryJson: string, row: ChatRunTimelineRow): costUsd: row.cost_usd ?? undefined, inputTokens: row.input_tokens ?? undefined, outputTokens: row.output_tokens ?? 
undefined, + artifacts: [], tools: [], subagents: [], errors: [], @@ -633,6 +710,80 @@ function summarizeToolOutput(status: "success" | "error", output: string | undef return "Tool produced output."; } +function normalizePageToolName(toolName: string | undefined): PageToolName | undefined { + if (!toolName) return undefined; + for (const pageToolName of PAGE_TOOL_NAMES) { + if (toolName === pageToolName || toolName.endsWith(`__${pageToolName}`) || toolName.endsWith(`:${pageToolName}`)) { + return pageToolName; + } + } + return undefined; +} + +function parseJsonRecord(value: string | undefined): Record | undefined { + if (!value) return undefined; + try { + return recordFromUnknown(JSON.parse(value)); + } catch { + return undefined; + } +} + +function recordFromUnknown(value: unknown): Record | undefined { + if (!isObject(value)) return undefined; + return value; +} + +function stringField(record: Record | undefined, key: string): string | undefined { + const value = record?.[key]; + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function numberField(record: Record | undefined, key: string): number | undefined { + const value = record?.[key]; + return typeof value === "number" && Number.isFinite(value) ? 
value : undefined; +} + +function normalizePageUrl(value: string | undefined): string | undefined { + if (!value) return undefined; + const trimmed = stripTrailingPunctuation(value.trim()); + if (!trimmed.includes("/ui/")) return undefined; + if (trimmed.includes("/ui/login") || trimmed.includes("magic=") || hasSensitiveQuery(trimmed)) return undefined; + return trimmed; +} + +function normalizePagePath(value: string | undefined): string | undefined { + if (!value) return undefined; + const cleaned = value.trim().replace(/^\/+/, "").replace(/^ui\//, ""); + if (!cleaned || cleaned.includes("..") || cleaned.includes("\0") || cleaned.startsWith("login")) return undefined; + return cleaned; +} + +function urlFromPath(path: string | undefined): string | undefined { + return path ? `/ui/${path}` : undefined; +} + +function urlFromText(value: string | undefined): string | undefined { + if (!value) return undefined; + const match = value.match(/(?:https?:\/\/[^\s"']*\/ui\/[^\s"']+|\/ui\/[^\s"']+)/); + return normalizePageUrl(match?.[0]); +} + +function stripTrailingPunctuation(value: string): string { + return value.replace(/[),.;]+$/g, ""); +} + +function hasSensitiveQuery(value: string): boolean { + return /[?&](?:api[_-]?key|token|secret|password|access_token|code|magic)=/i.test(value); +} + +function truncate(value: string, maxLength: number): string { + if (value.length <= maxLength) return value; + return `${value.slice(0, maxLength - 3)}...`; +} + function summarizeCommand(command: string): string | undefined { const redacted = redact(command).trim(); if (redacted.length === 0) return undefined; diff --git a/src/chat/writer.ts b/src/chat/writer.ts index dc0a44e8..1df0062e 100644 --- a/src/chat/writer.ts +++ b/src/chat/writer.ts @@ -126,6 +126,7 @@ export class ChatSessionWriter { buildChatContinuityContext({ sessionId: this.deps.sessionId, eventLog: this.deps.eventLog, + timelineStore: this.deps.timelineStore, }); const sessionContext = sessionContextProvider(); 
const response = await this.deps.runtime.runForChat(sessionKey, message, {