From af0c4381d9a4e42d3c5cabdb9ec9fb41c28e47c2 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 6 Apr 2026 01:30:19 +0000
Subject: [PATCH 1/4] =?UTF-8?q?feat:=20transcript=20import=20pipeline=20?=
 =?UTF-8?q?=E2=80=94=20grade=20existing=20Claude/Codex/Copilot=20sessions?=
 =?UTF-8?q?=20offline=20(#872)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `agentv import` command with Claude, Codex, and Copilot subcommands that
read existing AI coding sessions from disk and normalize them into a
tool-agnostic transcript JSONL format.

Add `--transcript` flag to `agentv eval` that skips provider invocation and
grades pre-recorded transcripts, enabling offline evaluation without re-running
sessions.

Rename `agentv trace` → `agentv inspect` (kept trace as deprecated alias).

Key changes:
- New parsers: codex-parser.ts, transcript-provider.ts
- New discovery: codex-session-discovery.ts
- Updated import output to spec format (input, output, source, token_usage, etc.)
- TranscriptProvider implements Provider interface for eval pipeline integration - Re-export copilot parser/discovery from import barrel for CLI access Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/commands/run.ts | 7 + apps/cli/src/commands/eval/run-eval.ts | 156 +++-- apps/cli/src/commands/import/claude.ts | 13 +- apps/cli/src/commands/import/codex.ts | 127 ++++ apps/cli/src/commands/import/copilot.ts | 158 +++++ apps/cli/src/commands/import/index.ts | 4 + apps/cli/src/commands/inspect/index.ts | 17 + apps/cli/src/commands/inspect/list.ts | 93 +++ apps/cli/src/commands/inspect/score.ts | 401 +++++++++++ apps/cli/src/commands/inspect/show.ts | 362 ++++++++++ apps/cli/src/commands/inspect/stats.ts | 261 +++++++ apps/cli/src/commands/inspect/utils.ts | 646 ++++++++++++++++++ apps/cli/src/index.ts | 5 +- .../core/src/evaluation/providers/targets.ts | 3 +- .../core/src/evaluation/providers/types.ts | 4 +- packages/core/src/import/codex-parser.ts | 238 +++++++ .../src/import/codex-session-discovery.ts | 113 +++ packages/core/src/import/index.ts | 28 +- .../core/src/import/transcript-provider.ts | 75 ++ packages/core/src/import/types.ts | 91 ++- 20 files changed, 2738 insertions(+), 64 deletions(-) create mode 100644 apps/cli/src/commands/import/codex.ts create mode 100644 apps/cli/src/commands/import/copilot.ts create mode 100644 apps/cli/src/commands/inspect/index.ts create mode 100644 apps/cli/src/commands/inspect/list.ts create mode 100644 apps/cli/src/commands/inspect/score.ts create mode 100644 apps/cli/src/commands/inspect/show.ts create mode 100644 apps/cli/src/commands/inspect/stats.ts create mode 100644 apps/cli/src/commands/inspect/utils.ts create mode 100644 packages/core/src/import/codex-parser.ts create mode 100644 packages/core/src/import/codex-session-discovery.ts create mode 100644 packages/core/src/import/transcript-provider.ts diff --git a/apps/cli/src/commands/eval/commands/run.ts 
b/apps/cli/src/commands/eval/commands/run.ts index 38efa28ad..d46665abe 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -192,6 +192,12 @@ export const evalRunCommand = command({ long: 'exclude-tag', description: 'Skip eval files that have this tag (repeatable, file skipped if any match)', }), + transcript: option({ + type: optional(string), + long: 'transcript', + description: + 'Grade a pre-recorded transcript JSONL instead of invoking a live provider. Ignores targets.', + }), }, handler: async (args) => { // Launch interactive wizard when no eval paths and stdin is a TTY @@ -237,6 +243,7 @@ export const evalRunCommand = command({ threshold: args.threshold, tag: args.tag, excludeTag: args.excludeTag, + transcript: args.transcript, }; const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions }); if (result?.allExecutionErrors) { diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 61aab86a6..5b316c6b2 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -90,6 +90,7 @@ interface NormalizedOptions { readonly threshold?: number; readonly tags: readonly string[]; readonly excludeTags: readonly string[]; + readonly transcript?: string; } function normalizeBoolean(value: unknown): boolean { @@ -357,6 +358,7 @@ function normalizeOptions( threshold: normalizeOptionalNumber(rawOptions.threshold), tags: normalizeStringArray(rawOptions.tag), excludeTags: normalizeStringArray(rawOptions.excludeTag), + transcript: normalizeString(rawOptions.transcript), } satisfies NormalizedOptions; } @@ -507,63 +509,86 @@ async function prepareFileMetadata(params: { category, }); const testIds = suite.tests.map((value) => value.id); - - // Determine target names: CLI --target flags override YAML - const cliTargets = options.cliTargets; const suiteTargets = suite.targets; - // Resolve which target names to use (precedence: CLI 
> suite YAML targets > default) - let targetNames: readonly string[]; - if (cliTargets.length > 0) { - targetNames = cliTargets; - } else if (suiteTargets && suiteTargets.length > 0) { - targetNames = suiteTargets; - } else { - targetNames = []; - } - let selections: { selection: TargetSelection; inlineTargetLabel: string }[]; - if (targetNames.length > 1) { - // Matrix mode: multiple targets - const multiSelections = await selectMultipleTargets({ - testFilePath, - repoRoot, - cwd, - explicitTargetsPath: options.targetsPath, - dryRun: options.dryRun, - dryRunDelay: options.dryRunDelay, - dryRunDelayMin: options.dryRunDelayMin, - dryRunDelayMax: options.dryRunDelayMax, - env: process.env, - targetNames, - }); - - selections = multiSelections.map((sel) => ({ - selection: sel, - inlineTargetLabel: sel.targetName, - })); - } else { - // Single target mode (legacy path) - const selection = await selectTarget({ - testFilePath, - repoRoot, - cwd, - explicitTargetsPath: options.targetsPath, - cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target, - dryRun: options.dryRun, - dryRunDelay: options.dryRunDelay, - dryRunDelayMin: options.dryRunDelayMin, - dryRunDelayMax: options.dryRunDelayMax, - env: process.env, - }); - + if (options.transcript) { + // --transcript mode: bypass target resolution entirely. + // Create a synthetic TargetSelection for the transcript provider. 
+ const transcriptSelection: TargetSelection = { + definitions: [], + resolvedTarget: { + kind: 'transcript', + name: 'transcript', + config: {} as Record, + }, + targetName: 'transcript', + targetSource: 'cli', + targetsFilePath: options.transcript, + }; selections = [ { - selection, - inlineTargetLabel: selection.targetName, + selection: transcriptSelection, + inlineTargetLabel: `transcript (${path.basename(options.transcript)})`, }, ]; + } else { + // Determine target names: CLI --target flags override YAML + const cliTargets = options.cliTargets; + const suiteTargets = suite.targets; + + // Resolve which target names to use (precedence: CLI > suite YAML targets > default) + let targetNames: readonly string[]; + if (cliTargets.length > 0) { + targetNames = cliTargets; + } else if (suiteTargets && suiteTargets.length > 0) { + targetNames = suiteTargets; + } else { + targetNames = []; + } + + if (targetNames.length > 1) { + // Matrix mode: multiple targets + const multiSelections = await selectMultipleTargets({ + testFilePath, + repoRoot, + cwd, + explicitTargetsPath: options.targetsPath, + dryRun: options.dryRun, + dryRunDelay: options.dryRunDelay, + dryRunDelayMin: options.dryRunDelayMin, + dryRunDelayMax: options.dryRunDelayMax, + env: process.env, + targetNames, + }); + + selections = multiSelections.map((sel) => ({ + selection: sel, + inlineTargetLabel: sel.targetName, + })); + } else { + // Single target mode (legacy path) + const selection = await selectTarget({ + testFilePath, + repoRoot, + cwd, + explicitTargetsPath: options.targetsPath, + cliTargetName: targetNames.length === 1 ? 
targetNames[0] : options.target, + dryRun: options.dryRun, + dryRunDelay: options.dryRunDelay, + dryRunDelayMin: options.dryRunDelayMin, + dryRunDelayMax: options.dryRunDelayMax, + env: process.env, + }); + + selections = [ + { + selection, + inlineTargetLabel: selection.targetName, + }, + ]; + } } return { @@ -623,6 +648,9 @@ async function runSingleEvalFile(params: { readonly totalBudgetUsd?: number; readonly failOnError?: FailOnError; readonly threshold?: number; + readonly providerFactory?: ( + target: import('@agentv/core').ResolvedTarget, + ) => import('@agentv/core').Provider; }): Promise<{ results: EvaluationResult[] }> { const { testFilePath, @@ -645,6 +673,7 @@ async function runSingleEvalFile(params: { matrixMode, totalBudgetUsd, failOnError, + providerFactory, } = params; const targetName = selection.targetName; @@ -742,6 +771,7 @@ async function runSingleEvalFile(params: { graderTarget: options.graderTarget, model: options.model, threshold: options.threshold, + providerFactory, streamCallbacks: streamingObserver?.getStreamCallbacks(), onResult: async (result: EvaluationResult) => { ( @@ -1198,6 +1228,31 @@ export async function runEvalCommand( // Use only files that survived tag filtering (fileMetadata keys) const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f)); + // --transcript: create a shared TranscriptProvider and validate line count + let transcriptProviderFactory: + | ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider) + | undefined; + if (options.transcript) { + const { TranscriptProvider } = await import('@agentv/core'); + const transcriptProvider = await TranscriptProvider.fromFile(options.transcript); + + // Validate: transcript lines must match total test cases across all files + const totalTests = [...fileMetadata.values()].reduce( + (sum, meta) => sum + meta.testCases.length, + 0, + ); + if (transcriptProvider.lineCount !== totalTests) { + throw new Error( + `Transcript has 
${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`, + ); + } + + transcriptProviderFactory = () => transcriptProvider; + console.log( + `Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`, + ); + } + try { await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => { const targetPrep = fileMetadata.get(testFilePath); @@ -1242,11 +1297,12 @@ export async function runEvalCommand( selection, inlineTargetLabel, testCases: applicableTestCases, - trialsConfig: targetPrep.trialsConfig, + trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, matrixMode: targetPrep.selections.length > 1, totalBudgetUsd: targetPrep.totalBudgetUsd, failOnError: targetPrep.failOnError, threshold: resolvedThreshold, + providerFactory: transcriptProviderFactory, }); return result.results; diff --git a/apps/cli/src/commands/import/claude.ts b/apps/cli/src/commands/import/claude.ts index 5664d1afe..4e75c2f1b 100644 --- a/apps/cli/src/commands/import/claude.ts +++ b/apps/cli/src/commands/import/claude.ts @@ -1,6 +1,11 @@ import { mkdir, writeFile } from 'node:fs/promises'; import path from 'node:path'; -import { discoverClaudeSessions, parseClaudeSession, readTranscriptFile } from '@agentv/core'; +import { + discoverClaudeSessions, + parseClaudeSession, + readTranscriptFile, + toTranscriptJsonLine, +} from '@agentv/core'; import { command, flag, option, optional, string } from 'cmd-ts'; export const importClaudeCommand = command({ @@ -106,9 +111,9 @@ export const importClaudeCommand = command({ // Ensure output directory exists await mkdir(path.dirname(outputPath), { recursive: true }); - // Write transcript as JSONL (one message per line) - const outputLines = transcript.messages.map((msg) => JSON.stringify(msg)); - await writeFile(outputPath, `${outputLines.join('\n')}\n`, 'utf8'); + // Write transcript as JSONL (one line per test case, 
snake_case wire format) + const jsonLine = toTranscriptJsonLine(transcript); + await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8'); const msgCount = transcript.messages.length; const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0); diff --git a/apps/cli/src/commands/import/codex.ts b/apps/cli/src/commands/import/codex.ts new file mode 100644 index 000000000..a99035b1d --- /dev/null +++ b/apps/cli/src/commands/import/codex.ts @@ -0,0 +1,127 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { + discoverCodexSessions, + parseCodexSession, + readTranscriptFile, + toTranscriptJsonLine, +} from '@agentv/core'; +import { command, flag, option, optional, string } from 'cmd-ts'; + +export const importCodexCommand = command({ + name: 'codex', + description: 'Import a Codex CLI session transcript for offline grading', + args: { + discover: option({ + type: optional(string), + long: 'discover', + description: 'Discovery mode: "latest" to import the most recent session', + }), + date: option({ + type: optional(string), + long: 'date', + description: 'Filter sessions by date (YYYY-MM-DD)', + }), + output: option({ + type: optional(string), + long: 'output', + short: 'o', + description: 'Output file path (default: .agentv/transcripts/codex-.jsonl)', + }), + sessionsDir: option({ + type: optional(string), + long: 'sessions-dir', + description: 'Override the default ~/.codex/sessions directory', + }), + list: flag({ + long: 'list', + description: 'List available sessions instead of importing', + }), + }, + handler: async ({ discover, date, output, sessionsDir, list }) => { + if (list) { + const sessions = await discoverCodexSessions({ + date, + sessionsDir, + limit: 20, + }); + + if (sessions.length === 0) { + console.log('No Codex CLI sessions found.'); + return; + } + + console.log(`Found ${sessions.length} session(s):\n`); + for (const session of sessions) { + const age = 
formatAge(session.updatedAt); + console.log(` ${session.sessionId} ${age} ${session.filename}`); + } + return; + } + + if (discover !== 'latest') { + console.error('Error: specify --discover latest to select a session.'); + process.exit(1); + } + + const sessions = await discoverCodexSessions({ + date, + sessionsDir, + latest: true, + }); + + if (sessions.length === 0) { + console.error('Error: no Codex CLI sessions found.'); + process.exit(1); + } + + const session = sessions[0]; + console.log(`Discovered latest session: ${session.filename}`); + + // Parse the session + const rawJsonl = await readTranscriptFile(session.filePath); + const transcript = parseCodexSession(rawJsonl); + + // Determine output path + const shortId = session.sessionId.slice(0, 8); + const outputPath = output ?? path.join('.agentv', 'transcripts', `codex-${shortId}.jsonl`); + + // Ensure output directory exists + await mkdir(path.dirname(outputPath), { recursive: true }); + + // Write transcript as JSONL (snake_case wire format) + const jsonLine = toTranscriptJsonLine(transcript); + await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8'); + + const msgCount = transcript.messages.length; + const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 
0), 0); + + console.log(`Imported ${msgCount} messages (${toolCount} tool calls) → ${outputPath}`); + + if (transcript.source.model) { + console.log(` Model: ${transcript.source.model}`); + } + if (transcript.durationMs !== undefined) { + console.log(` Duration: ${formatDurationMs(transcript.durationMs)}`); + } + }, +}); + +function formatAge(date: Date): string { + const diffMs = Date.now() - date.getTime(); + const diffMin = Math.floor(diffMs / 60_000); + if (diffMin < 60) return `${diffMin}m ago`; + const diffHours = Math.floor(diffMin / 60); + if (diffHours < 24) return `${diffHours}h ago`; + const diffDays = Math.floor(diffHours / 24); + return `${diffDays}d ago`; +} + +function formatDurationMs(ms: number): string { + if (ms < 1000) return `${ms}ms`; + const seconds = Math.floor(ms / 1000); + if (seconds < 60) return `${seconds}s`; + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return `${minutes}m ${remainingSeconds}s`; +} diff --git a/apps/cli/src/commands/import/copilot.ts b/apps/cli/src/commands/import/copilot.ts new file mode 100644 index 000000000..dab154120 --- /dev/null +++ b/apps/cli/src/commands/import/copilot.ts @@ -0,0 +1,158 @@ +import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { discoverCopilotSessions, parseCopilotEvents, toTranscriptJsonLine } from '@agentv/core'; +import { command, flag, option, optional, string } from 'cmd-ts'; + +export const importCopilotCommand = command({ + name: 'copilot', + description: 'Import a Copilot CLI session transcript for offline grading', + args: { + sessionId: option({ + type: optional(string), + long: 'session-id', + description: 'UUID of the Copilot CLI session to import', + }), + discover: option({ + type: optional(string), + long: 'discover', + description: 'Discovery mode: "latest" to import the most recent session', + }), + output: option({ + type: optional(string), + long: 'output', + short: 'o', + description: 
+ 'Output file path (default: .agentv/transcripts/copilot-.jsonl)', + }), + sessionStateDir: option({ + type: optional(string), + long: 'session-state-dir', + description: 'Override the default ~/.copilot/session-state directory', + }), + list: flag({ + long: 'list', + description: 'List available sessions instead of importing', + }), + }, + handler: async ({ sessionId, discover, output, sessionStateDir, list }) => { + if (list) { + const sessions = await discoverCopilotSessions({ + sessionStateDir, + limit: 20, + }); + + if (sessions.length === 0) { + console.log('No Copilot CLI sessions found.'); + return; + } + + console.log(`Found ${sessions.length} session(s):\n`); + for (const session of sessions) { + const age = formatAge(session.updatedAt); + const status = session.isActive ? ' (active)' : ''; + console.log(` ${session.sessionId} ${age} ${session.cwd}${status}`); + } + return; + } + + let sessionDir: string; + let resolvedSessionId: string; + + if (sessionId) { + const sessions = await discoverCopilotSessions({ + sessionStateDir, + limit: 100, + }); + const match = sessions.find((s: { sessionId: string }) => s.sessionId === sessionId); + if (!match) { + console.error(`Error: session ${sessionId} not found.`); + process.exit(1); + } + sessionDir = match.sessionDir; + resolvedSessionId = sessionId; + } else if (discover === 'latest') { + const sessions = await discoverCopilotSessions({ + sessionStateDir, + limit: 1, + }); + + if (sessions.length === 0) { + console.error('Error: no Copilot CLI sessions found.'); + process.exit(1); + } + sessionDir = sessions[0].sessionDir; + resolvedSessionId = sessions[0].sessionId; + console.log(`Discovered latest session: ${resolvedSessionId}`); + } else { + console.error('Error: specify --session-id or --discover latest to select a session.'); + process.exit(1); + } + + // Parse the session + const eventsPath = path.join(sessionDir, 'events.jsonl'); + const rawJsonl = await readFile(eventsPath, 'utf8'); + const parsed = 
parseCopilotEvents(rawJsonl); + + // Convert to TranscriptEntry format + const transcript = { + messages: parsed.messages, + source: { + provider: 'copilot' as const, + sessionId: resolvedSessionId, + cwd: parsed.meta.cwd, + startedAt: parsed.meta.startedAt, + model: parsed.meta.model, + }, + tokenUsage: parsed.tokenUsage, + durationMs: parsed.durationMs, + costUsd: null as number | null, + }; + + // Determine output path + const shortId = resolvedSessionId.slice(0, 8); + const outputPath = output ?? path.join('.agentv', 'transcripts', `copilot-${shortId}.jsonl`); + + // Ensure output directory exists + await mkdir(path.dirname(outputPath), { recursive: true }); + + // Write transcript as JSONL (snake_case wire format) + const jsonLine = toTranscriptJsonLine(transcript); + await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8'); + + const msgCount = transcript.messages.length; + const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0); + + console.log(`Imported ${msgCount} messages (${toolCount} tool calls) → ${outputPath}`); + + if (transcript.source.model) { + console.log(` Model: ${transcript.source.model}`); + } + if (transcript.durationMs !== undefined) { + console.log(` Duration: ${formatDurationMs(transcript.durationMs)}`); + } + if (transcript.tokenUsage) { + console.log( + ` Tokens: ${transcript.tokenUsage.input} in / ${transcript.tokenUsage.output} out`, + ); + } + }, +}); + +function formatAge(date: Date): string { + const diffMs = Date.now() - date.getTime(); + const diffMin = Math.floor(diffMs / 60_000); + if (diffMin < 60) return `${diffMin}m ago`; + const diffHours = Math.floor(diffMin / 60); + if (diffHours < 24) return `${diffHours}h ago`; + const diffDays = Math.floor(diffHours / 24); + return `${diffDays}d ago`; +} + +function formatDurationMs(ms: number): string { + if (ms < 1000) return `${ms}ms`; + const seconds = Math.floor(ms / 1000); + if (seconds < 60) return `${seconds}s`; + const minutes 
= Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return `${minutes}m ${remainingSeconds}s`; +} diff --git a/apps/cli/src/commands/import/index.ts b/apps/cli/src/commands/import/index.ts index d76ddcaf0..84435d4c0 100644 --- a/apps/cli/src/commands/import/index.ts +++ b/apps/cli/src/commands/import/index.ts @@ -1,11 +1,15 @@ import { subcommands } from 'cmd-ts'; import { importClaudeCommand } from './claude.js'; +import { importCodexCommand } from './codex.js'; +import { importCopilotCommand } from './copilot.js'; export const importCommand = subcommands({ name: 'import', description: 'Import agent session transcripts for offline grading', cmds: { claude: importClaudeCommand, + codex: importCodexCommand, + copilot: importCopilotCommand, }, }); diff --git a/apps/cli/src/commands/inspect/index.ts b/apps/cli/src/commands/inspect/index.ts new file mode 100644 index 000000000..94d0a0b0e --- /dev/null +++ b/apps/cli/src/commands/inspect/index.ts @@ -0,0 +1,17 @@ +import { subcommands } from 'cmd-ts'; + +import { traceListCommand } from './list.js'; +import { traceScoreCommand } from './score.js'; +import { traceShowCommand } from './show.js'; +import { traceStatsCommand } from './stats.js'; + +export const inspectCommand = subcommands({ + name: 'inspect', + description: 'Inspect and analyze evaluation results', + cmds: { + list: traceListCommand, + score: traceScoreCommand, + show: traceShowCommand, + stats: traceStatsCommand, + }, +}); diff --git a/apps/cli/src/commands/inspect/list.ts b/apps/cli/src/commands/inspect/list.ts new file mode 100644 index 000000000..42bea2b72 --- /dev/null +++ b/apps/cli/src/commands/inspect/list.ts @@ -0,0 +1,93 @@ +import { command, number, oneOf, option, optional, string } from 'cmd-ts'; +import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; +import { + type ResultFileMeta, + c, + formatScore, + formatSize, + listResultFiles, + padLeft, + padRight, +} from './utils.js'; + +function formatListTable(metas: 
ResultFileMeta[]): string { + const lines: string[] = []; + + if (metas.length === 0) { + lines.push(`${c.yellow}No run workspaces found in .agentv/results/runs/${c.reset}`); + lines.push(`${c.dim}Run an evaluation first: agentv run ${c.reset}`); + return lines.join('\n'); + } + + lines.push(''); + lines.push(`${c.bold}Evaluation Runs${c.reset} ${c.dim}(.agentv/results/runs/)${c.reset}`); + lines.push(''); + + // Column widths + const maxFileLen = Math.max(4, ...metas.map((m) => m.filename.length)); + + // Header + const header = ` ${padRight('File', maxFileLen)} ${padLeft('Tests', 5)} ${padLeft('Pass', 5)} ${padLeft('Score', 6)} ${padLeft('Size', 7)} Timestamp`; + lines.push(`${c.dim}${header}${c.reset}`); + lines.push( + `${c.dim} ${'─'.repeat(maxFileLen)} ${'─'.repeat(5)} ${'─'.repeat(5)} ${'─'.repeat(6)} ${'─'.repeat(7)} ${'─'.repeat(24)}${c.reset}`, + ); + + for (const meta of metas) { + const passColor = meta.passRate >= 1.0 ? c.green : meta.passRate >= 0.5 ? c.yellow : c.red; + const scoreColor = meta.avgScore >= 0.9 ? c.green : meta.avgScore >= 0.5 ? c.yellow : c.red; + + const row = ` ${padRight(meta.filename, maxFileLen)} ${padLeft(String(meta.testCount), 5)} ${padLeft(`${passColor}${formatScore(meta.passRate)}${c.reset}`, 5)} ${padLeft(`${scoreColor}${formatScore(meta.avgScore)}${c.reset}`, 6)} ${padLeft(formatSize(meta.sizeBytes), 7)} ${c.dim}${meta.timestamp}${c.reset}`; + lines.push(row); + } + + lines.push(''); + lines.push( + `${c.dim}${metas.length} run workspace${metas.length !== 1 ? 
's' : ''} found${c.reset}`, + ); + lines.push(''); + + return lines.join('\n'); +} + +export const traceListCommand = command({ + name: 'list', + description: 'List recent evaluation run workspaces from .agentv/results/runs/', + args: { + limit: option({ + type: optional(number), + long: 'limit', + short: 'n', + description: 'Maximum number of results to show (default: all)', + }), + format: option({ + type: optional(oneOf(['table', 'json'])), + long: 'format', + short: 'f', + description: 'Output format: table (default) or json', + }), + dir: option({ + type: optional(string), + long: 'dir', + short: 'd', + description: 'Working directory (default: current directory)', + }), + }, + handler: async ({ limit, format, dir }) => { + const cwd = dir ?? process.cwd(); + const outputFormat = format ?? 'table'; + + try { + const metas = listResultFiles(cwd, limit); + + if (outputFormat === 'json') { + console.log(JSON.stringify(toSnakeCaseDeep(metas), null, 2)); + } else { + console.log(formatListTable(metas)); + } + } catch (error) { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + } + }, +}); diff --git a/apps/cli/src/commands/inspect/score.ts b/apps/cli/src/commands/inspect/score.ts new file mode 100644 index 000000000..da986096c --- /dev/null +++ b/apps/cli/src/commands/inspect/score.ts @@ -0,0 +1,401 @@ +import { + type EvalTest, + type EvaluationContext, + type EvaluationScore, + type Evaluator, + type EvaluatorConfig, + type EvaluatorDispatchContext, + type Message, + type Provider, + type ProviderRequest, + type ProviderResponse, + createBuiltinRegistry, + toCamelCaseDeep, +} from '@agentv/core'; +import { command, oneOf, option, optional, positional, string } from 'cmd-ts'; +import { + type RawResult, + c, + formatScore, + loadResultFile, + padLeft, + padRight, + toTraceSummary, +} from './utils.js'; + +/** + * Evaluator types that work without an LLM provider. 
+ */ +const SUPPORTED_TYPES = [ + 'contains', + 'regex', + 'is-json', + 'equals', + 'latency', + 'cost', + 'token-usage', + 'execution-metrics', +] as const; + +/** + * Parse key=value pairs from a string like "max_tool_calls=10,max_tokens=2000" + */ +function parseKeyValues(s: string): Record { + const result: Record = {}; + if (!s) return result; + for (const pair of s.split(',')) { + const eqIdx = pair.indexOf('='); + if (eqIdx === -1) continue; + result[pair.slice(0, eqIdx).trim()] = pair.slice(eqIdx + 1).trim(); + } + return result; +} + +/** + * Parse an inline evaluator spec string into an EvaluatorConfig. + * + * Supported formats: + * contains:value + * regex:pattern + * is-json + * equals:value + * latency: + * cost: + * token-usage:max_total=N,max_input=N,max_output=N + * execution-metrics:max_tool_calls=N,max_tokens=N,max_llm_calls=N,... + */ +export function parseAssertSpec(spec: string): EvaluatorConfig { + const colonIdx = spec.indexOf(':'); + // Normalize snake_case to kebab-case for backward compat + const type = (colonIdx === -1 ? spec : spec.slice(0, colonIdx)).replace(/_/g, '-'); + const params = colonIdx === -1 ? 
'' : spec.slice(colonIdx + 1); + + switch (type) { + case 'contains': + if (!params) throw new Error('contains requires a value: contains:'); + return { name: 'contains', type: 'contains', value: params } as EvaluatorConfig; + + case 'regex': + if (!params) throw new Error('regex requires a pattern: regex:'); + return { name: 'regex', type: 'regex', value: params } as EvaluatorConfig; + + case 'is-json': + return { name: 'is-json', type: 'is-json' } as EvaluatorConfig; + + case 'equals': + if (!params) throw new Error('equals requires a value: equals:'); + return { name: 'equals', type: 'equals', value: params } as EvaluatorConfig; + + case 'latency': { + const threshold = Number(params); + if (!params || Number.isNaN(threshold)) + throw new Error('latency requires a threshold in ms: latency:'); + return { name: 'latency', type: 'latency', threshold } as EvaluatorConfig; + } + + case 'cost': { + const budget = Number(params); + if (!params || Number.isNaN(budget)) + throw new Error('cost requires a budget in USD: cost:'); + return { name: 'cost', type: 'cost', budget } as EvaluatorConfig; + } + + case 'token-usage': { + const kv = parseKeyValues(params); + const config: Record = { name: 'token-usage', type: 'token-usage' }; + if (kv.max_total) config.max_total = Number(kv.max_total); + if (kv.max_input) config.max_input = Number(kv.max_input); + if (kv.max_output) config.max_output = Number(kv.max_output); + return config as EvaluatorConfig; + } + + case 'execution-metrics': { + const kv = parseKeyValues(params); + const config: Record = { + name: 'execution-metrics', + type: 'execution-metrics', + }; + if (kv.max_tool_calls) config.max_tool_calls = Number(kv.max_tool_calls); + if (kv.max_llm_calls) config.max_llm_calls = Number(kv.max_llm_calls); + if (kv.max_tokens) config.max_tokens = Number(kv.max_tokens); + if (kv.max_cost_usd) config.max_cost_usd = Number(kv.max_cost_usd); + if (kv.max_duration_ms) config.max_duration_ms = Number(kv.max_duration_ms); + return 
config as EvaluatorConfig; + } + + default: + throw new Error( + `Unsupported evaluator type: "${type}". Supported: ${SUPPORTED_TYPES.join(', ')}`, + ); + } +} + +/** + * Extract candidate answer from a result record. + */ +function extractCandidate(raw: RawResult): string { + if (raw.output !== undefined) + return typeof raw.output === 'string' ? raw.output : JSON.stringify(raw.output); + return ''; +} + +/** + * Build a minimal EvalTest stub from a result record. + * Only used to satisfy the EvaluationContext interface — deterministic and + * trace-based evaluators don't access these fields. + */ +function buildTestCase(raw: RawResult): EvalTest { + return { + id: raw.test_id ?? 'unknown', + question: '', + input: [], + expected_output: [], + + file_paths: [], + criteria: '', + }; +} + +/** + * A no-op provider stub for evaluators that don't call LLM providers. + */ +const stubProvider: Provider = { + id: 'trace-score-stub', + kind: 'mock', + targetName: 'trace-score-stub', + invoke(_request: ProviderRequest): Promise { + throw new Error('trace score does not support LLM-based evaluators'); + }, +}; + +/** + * A no-op evaluator stub used as the required llmGrader in the dispatch context. 
+ */ +const stubLlmGrader: Evaluator = { + kind: 'llm-grader', + evaluate(): EvaluationScore { + throw new Error('trace score does not support LLM-based evaluators'); + }, +}; + +interface ScoreResult { + testId: string; + candidate: string; + originalScore: number; + newScore: number; + verdict: string; + assertions: readonly { text: string; passed: boolean; evidence?: string }[]; +} + +async function runScore( + results: RawResult[], + evaluatorConfig: EvaluatorConfig, + testIdFilter?: string, +): Promise { + const registry = createBuiltinRegistry(); + + const dispatchContext: EvaluatorDispatchContext = { + llmGrader: stubLlmGrader, + registry, + }; + + const evaluator = await registry.create(evaluatorConfig, dispatchContext); + const scored: ScoreResult[] = []; + + for (const raw of results) { + if (testIdFilter && raw.test_id !== testIdFilter) continue; + + const trace = toTraceSummary(raw); + const candidate = extractCandidate(raw); + const output = raw.output as readonly Message[] | undefined; + + const evalContext: EvaluationContext = { + evalCase: buildTestCase(raw), + candidate, + target: { kind: 'custom' as const, name: raw.target ?? 'unknown', config: {} } as never, + provider: stubProvider, + attempt: 1, + promptInputs: { question: '' }, + now: new Date(), + output: Array.isArray(output) ? output : undefined, + trace, + tokenUsage: raw.token_usage + ? (toCamelCaseDeep(raw.token_usage) as EvaluationContext['tokenUsage']) + : undefined, + costUsd: raw.cost_usd, + durationMs: raw.duration_ms, + startTime: raw.start_time, + endTime: raw.end_time, + }; + + const score = await evaluator.evaluate(evalContext); + scored.push({ + testId: raw.test_id ?? 
'unknown', + candidate: candidate.slice(0, 80), + originalScore: raw.score, + newScore: score.score, + verdict: score.verdict, + assertions: score.assertions, + }); + } + + return scored; +} + +function renderTable(scored: ScoreResult[], assertSpec: string): string { + const lines: string[] = []; + + // Header + const cols = [ + { header: 'Test ID', width: 24 }, + { header: 'Orig', width: 6 }, + { header: 'New', width: 6 }, + { header: 'Verdict', width: 8 }, + { header: 'Detail', width: 50 }, + ]; + + const headerLine = cols + .map((col) => padRight(`${c.bold}${col.header}${c.reset}`, col.width)) + .join(' '); + lines.push(headerLine); + lines.push(cols.map((col) => '─'.repeat(col.width)).join('──')); + + for (const r of scored) { + const verdictColor = r.verdict === 'pass' ? c.green : c.red; + const failed = r.assertions.filter((a) => !a.passed); + const passed = r.assertions.filter((a) => a.passed); + const detail = + failed.length > 0 + ? failed[0].text.slice(0, 48) + : passed.length > 0 + ? passed[0].text.slice(0, 48) + : ''; + + const row = [ + padRight(r.testId.slice(0, 24), cols[0].width), + padLeft(formatScore(r.originalScore), cols[1].width), + padLeft(`${verdictColor}${formatScore(r.newScore)}${c.reset}`, cols[2].width), + padRight(`${verdictColor}${r.verdict.toUpperCase()}${c.reset}`, cols[3].width), + detail.slice(0, cols[4].width), + ].join(' '); + lines.push(row); + } + + // Summary + const passCount = scored.filter((r) => r.verdict === 'pass').length; + const total = scored.length; + const meanScore = total > 0 ? 
scored.reduce((sum, r) => sum + r.newScore, 0) / total : 0; + lines.push(''); + lines.push( + `${c.bold}Assert:${c.reset} ${assertSpec} ${c.bold}Results:${c.reset} ${passCount}/${total} passed (${formatScore(passCount / (total || 1))}) ${c.bold}Mean:${c.reset} ${formatScore(meanScore)}`, + ); + + return lines.join('\n'); +} + +export const traceScoreCommand = command({ + name: 'score', + description: 'Run evaluators against existing trace sources post-hoc', + args: { + file: positional({ + type: string, + displayName: 'trace-source', + description: + 'Path to a run workspace, result manifest, simple trace JSONL, or OTLP JSON file', + }), + assert: option({ + type: string, + long: 'assert', + short: 'a', + description: + 'Evaluator spec: contains:, regex:, is-json, equals:, latency:, cost:, token-usage:, execution-metrics:', + }), + testId: option({ + type: optional(string), + long: 'test-id', + description: 'Filter to a specific test ID', + }), + format: option({ + type: optional(oneOf(['json', 'table'])), + long: 'format', + short: 'f', + description: 'Output format (default: table)', + }), + }, + handler: async ({ file, assert: assertSpec, testId, format }) => { + // Parse the evaluator spec + let evaluatorConfig: EvaluatorConfig; + try { + evaluatorConfig = parseAssertSpec(assertSpec); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error(`${c.red}Error:${c.reset} ${msg}`); + process.exit(1); + } + + // Load results + let results: RawResult[]; + try { + results = loadResultFile(file); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + console.error(`${c.red}Error:${c.reset} Could not load result file: ${msg}`); + process.exit(1); + } + + if (results.length === 0) { + console.error(`${c.yellow}Warning:${c.reset} No results found in ${file}`); + process.exit(0); + } + + // Check for trace data if evaluator needs it + const traceRequired = ['latency', 'cost', 'token-usage', 'execution-metrics'].includes( + evaluatorConfig.type, + ); + if (traceRequired) { + const hasTrace = results.some( + (r) => + toTraceSummary(r) || + r.cost_usd !== undefined || + r.duration_ms !== undefined || + r.token_usage !== undefined, + ); + if (!hasTrace) { + console.error( + `${c.red}Error:${c.reset} Source lacks trace metrics. Use an OTLP trace export via ${c.bold}--otel-file${c.reset} or a run manifest with summary metrics in ${c.bold}index.jsonl${c.reset}.`, + ); + process.exit(1); + } + } + + // Run scoring + let scored: ScoreResult[]; + try { + scored = await runScore(results, evaluatorConfig, testId); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error(`${c.red}Error:${c.reset} Scoring failed: ${msg}`); + process.exit(1); + } + + if (scored.length === 0) { + console.error( + `${c.yellow}Warning:${c.reset} No results matched${testId ? 
` test ID "${testId}"` : ''}`, + ); + process.exit(0); + } + + // Output + if (format === 'json') { + console.log(JSON.stringify(scored, null, 2)); + } else { + console.log(renderTable(scored, assertSpec)); + } + + // Exit with non-zero if any failed + const hasFailures = scored.some((r) => r.verdict !== 'pass'); + if (hasFailures) { + process.exit(1); + } + }, +}); diff --git a/apps/cli/src/commands/inspect/show.ts b/apps/cli/src/commands/inspect/show.ts new file mode 100644 index 000000000..50e12f7e7 --- /dev/null +++ b/apps/cli/src/commands/inspect/show.ts @@ -0,0 +1,362 @@ +import { command, flag, oneOf, option, optional, positional, string } from 'cmd-ts'; +import { + type RawResult, + c, + formatCost, + formatDuration, + formatNumber, + formatScore, + getTraceSpans, + getTraceSummary, + loadResultFile, +} from './utils.js'; + +/** + * Render flat trace summary line (fallback when full output messages not available). + */ +function renderFlatTrace(result: RawResult): string { + const trace = getTraceSummary(result); + const parts: string[] = []; + + if (trace?.tool_calls && Object.keys(trace.tool_calls).length > 0) { + const toolParts = Object.entries(trace.tool_calls).map(([name, count]) => { + return count > 1 ? `${name} ×${count}` : name; + }); + parts.push(`Tools: ${toolParts.join(', ')}`); + } + + if (result.duration_ms !== undefined) { + parts.push(`Duration: ${formatDuration(result.duration_ms)}`); + } + + if (result.token_usage) { + const total = result.token_usage.input + result.token_usage.output; + parts.push(`Tokens: ${formatNumber(total)}`); + } + + if (result.cost_usd !== undefined) { + parts.push(`Cost: ${formatCost(result.cost_usd)}`); + } + + if (trace?.llm_call_count !== undefined) { + parts.push(`LLM calls: ${trace.llm_call_count}`); + } + + return parts.join(' | '); +} + +/** + * Render per-evaluator scores inline. 
+ */ +function renderScores(scores: { name: string; score: number; type: string }[]): string { + return scores + .map((s) => { + const scoreColor = s.score >= 0.9 ? c.green : s.score >= 0.5 ? c.yellow : c.red; + return `${s.name} ${scoreColor}${formatScore(s.score)}${c.reset}`; + }) + .join(' | '); +} + +// Raw output message shape (snake_case from JSONL) +interface RawMessage { + role?: string; + content?: unknown; + tool_calls?: RawToolCall[]; + start_time?: string; + end_time?: string; + duration_ms?: number; + token_usage?: { input: number; output: number; cached?: number }; +} + +interface RawToolCall { + tool: string; + input?: unknown; + output?: unknown; + start_time?: string; + end_time?: string; + duration_ms?: number; +} + +/** + * Render tree view from output messages. + * Shows a hierarchical trace: LLM calls → tool calls. + */ +function renderTree(result: RawResult): string { + const messages = result.output as RawMessage[] | undefined; + const spans = getTraceSpans(result); + + if (!messages || messages.length === 0) { + if (spans.length > 0) { + return renderSpanTree(result, spans); + } + // Fallback to flat summary + if ( + getTraceSummary(result) || + result.duration_ms !== undefined || + result.cost_usd !== undefined + ) { + return renderFlatTrace(result); + } + return `${c.dim}No trace data available${c.reset}`; + } + + const lines: string[] = []; + const testId = result.test_id ?? result.eval_id ?? 'unknown'; + + // Root node: test execution + const totalDuration = result.duration_ms; + const totalTokens = result.token_usage + ? 
result.token_usage.input + result.token_usage.output + : undefined; + const rootParts: string[] = [testId]; + if (totalDuration !== undefined) rootParts.push(formatDuration(totalDuration)); + if (totalTokens !== undefined) rootParts.push(`${formatNumber(totalTokens)} tok`); + if (result.cost_usd !== undefined) rootParts.push(formatCost(result.cost_usd)); + lines.push(`${c.bold}${rootParts.join(', ')}${c.reset}`); + + // Filter to meaningful messages (assistant with tool calls, or assistant responses) + const steps: { type: 'llm' | 'tools'; msg: RawMessage; index: number }[] = []; + for (let i = 0; i < messages.length; i++) { + const msg = messages[i]; + if (msg.role === 'assistant') { + if (msg.tool_calls && msg.tool_calls.length > 0) { + steps.push({ type: 'tools', msg, index: i }); + } else { + steps.push({ type: 'llm', msg, index: i }); + } + } + } + + for (let si = 0; si < steps.length; si++) { + const step = steps[si]; + const isLast = si === steps.length - 1; + const connector = isLast ? '└─' : '├─'; + const childPrefix = isLast ? ' ' : '│ '; + + if (step.type === 'llm') { + // LLM response without tool calls + const parts: string[] = [`${c.cyan}model${c.reset}`]; + if (step.msg.duration_ms !== undefined) parts.push(formatDuration(step.msg.duration_ms)); + if (step.msg.token_usage) { + const tok = step.msg.token_usage.input + step.msg.token_usage.output; + parts.push(`${formatNumber(tok)} tok`); + } + lines.push(`${connector} ${parts.join(', ')}`); + } else { + // Tool calls + const toolCalls = step.msg.tool_calls ?? 
[]; + + if (toolCalls.length === 1) { + // Single tool call — inline + const tc = toolCalls[0]; + const parts: string[] = [`${c.yellow}${tc.tool}${c.reset}`]; + if (tc.duration_ms !== undefined) parts.push(formatDuration(tc.duration_ms)); + lines.push(`${connector} ${parts.join(', ')}`); + } else { + // Multiple tool calls — expand + const parts: string[] = [`${c.dim}tools${c.reset}`]; + if (step.msg.duration_ms !== undefined) parts.push(formatDuration(step.msg.duration_ms)); + lines.push(`${connector} ${parts.join(', ')}`); + + for (let ti = 0; ti < toolCalls.length; ti++) { + const tc = toolCalls[ti]; + const isLastTool = ti === toolCalls.length - 1; + const toolConnector = isLastTool ? '└─' : '├─'; + const tcParts: string[] = [`${c.yellow}${tc.tool}${c.reset}`]; + if (tc.duration_ms !== undefined) tcParts.push(formatDuration(tc.duration_ms)); + lines.push(`${childPrefix}${toolConnector} ${tcParts.join(', ')}`); + } + } + } + } + + // Scores line + if (result.scores && result.scores.length > 0) { + lines.push(''); + lines.push(`${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`); + } + + return lines.join('\n'); +} + +function renderSpanTree(result: RawResult, spans: ReturnType): string { + const lines: string[] = []; + const testId = result.test_id ?? result.eval_id ?? 'unknown'; + const totalTokens = result.token_usage + ? result.token_usage.input + result.token_usage.output + : undefined; + const rootParts: string[] = [testId]; + if (result.duration_ms !== undefined) rootParts.push(formatDuration(result.duration_ms)); + if (totalTokens !== undefined) rootParts.push(`${formatNumber(totalTokens)} tok`); + if (result.cost_usd !== undefined) rootParts.push(formatCost(result.cost_usd)); + lines.push(`${c.bold}${rootParts.join(', ')}${c.reset}`); + + spans.forEach((span, index) => { + const connector = index === spans.length - 1 ? '└─' : '├─'; + const color = span.type === 'llm' ? 
c.cyan : c.yellow; + const parts = [`${color}${span.name}${c.reset}`]; + if (span.duration_ms !== undefined) { + parts.push(formatDuration(span.duration_ms)); + } + lines.push(`${connector} ${parts.join(', ')}`); + }); + + if (result.scores && result.scores.length > 0) { + lines.push(''); + lines.push(`${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`); + } + + return lines.join('\n'); +} + +/** + * Format a single result for table display. + */ +function formatResultDetail(result: RawResult, index: number, tree: boolean): string { + const lines: string[] = []; + const testId = result.test_id ?? result.eval_id ?? `result-${index}`; + + if (tree) { + // Tree view + lines.push(renderTree(result)); + return lines.join('\n'); + } + + // Standard flat view + const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red; + lines.push( + `${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.suite ? 
` ${c.dim}suite: ${result.suite}${c.reset}` : ''}`, + ); + + if (result.error) { + lines.push(` ${c.red}Error: ${result.error}${c.reset}`); + } + + if (result.assertions && result.assertions.length > 0) { + const passed = result.assertions.filter((a: { passed: boolean }) => a.passed); + const failed = result.assertions.filter((a: { passed: boolean }) => !a.passed); + if (passed.length > 0) + lines.push( + ` ${c.green}✓ Passed:${c.reset} ${passed.map((a: { text: string }) => a.text).join(', ')}`, + ); + if (failed.length > 0) + lines.push( + ` ${c.red}✗ Failed:${c.reset} ${failed.map((a: { text: string }) => a.text).join(', ')}`, + ); + } + + if (result.scores && result.scores.length > 0) { + lines.push(` ${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`); + } + + if (result.trace || result.duration_ms !== undefined || result.cost_usd !== undefined) { + lines.push(` ${c.dim}Trace:${c.reset} ${renderFlatTrace(result)}`); + } + + if (result.assertions && result.assertions.length > 0) { + const withEvidence = result.assertions.filter((a: { evidence?: string }) => a.evidence); + if (withEvidence.length > 0) { + const maxLen = 200; + const evidence = (withEvidence[0] as { evidence: string }).evidence; + const truncated = evidence.length > maxLen ? `${evidence.slice(0, maxLen)}...` : evidence; + lines.push(` ${c.dim}Evidence: ${truncated}${c.reset}`); + } + } + + return lines.join('\n'); +} + +function formatShowTable( + results: RawResult[], + filePath: string, + testIdFilter?: string, + tree?: boolean, +): string { + const lines: string[] = []; + + let filtered = results; + if (testIdFilter) { + filtered = results.filter((r) => (r.test_id ?? r.eval_id) === testIdFilter); + if (filtered.length === 0) { + lines.push(`${c.yellow}No results found with test ID "${testIdFilter}"${c.reset}`); + lines.push(''); + lines.push(`${c.dim}Available test IDs:${c.reset}`); + for (const r of results) { + lines.push(` ${r.test_id ?? r.eval_id ?? 
'(unnamed)'}`); + } + return lines.join('\n'); + } + } + + lines.push(''); + lines.push(`${c.bold}Results:${c.reset} ${c.cyan}${filePath}${c.reset}`); + + const totalTests = filtered.length; + const passCount = filtered.filter((r) => r.score >= 1.0).length; + const failCount = totalTests - passCount; + const avgScore = totalTests > 0 ? filtered.reduce((sum, r) => sum + r.score, 0) / totalTests : 0; + + lines.push( + `${c.dim}${totalTests} test${totalTests !== 1 ? 's' : ''} | ${c.green}${passCount} passed${c.reset}${c.dim}${failCount > 0 ? ` | ${c.red}${failCount} failed${c.reset}${c.dim}` : ''} | avg score: ${formatScore(avgScore)}${c.reset}`, + ); + lines.push(''); + + for (let i = 0; i < filtered.length; i++) { + lines.push(formatResultDetail(filtered[i], i, tree ?? false)); + if (i < filtered.length - 1) { + lines.push(`${c.dim}${'─'.repeat(60)}${c.reset}`); + } + } + + lines.push(''); + return lines.join('\n'); +} + +export const traceShowCommand = command({ + name: 'show', + description: 'Show evaluation results with trace details from a result file', + args: { + file: positional({ + type: string, + displayName: 'trace-source', + description: + 'Path to a run workspace, result manifest, simple trace JSONL, or OTLP JSON file', + }), + testId: option({ + type: optional(string), + long: 'test-id', + description: 'Filter to a specific test ID', + }), + tree: flag({ + long: 'tree', + description: 'Show hierarchical trace tree from output messages or exported trace spans', + }), + format: option({ + type: optional(oneOf(['table', 'json'])), + long: 'format', + short: 'f', + description: 'Output format: table (default) or json', + }), + }, + handler: async ({ file, testId, tree, format }) => { + const outputFormat = format ?? 'table'; + + try { + const results = loadResultFile(file); + + let filtered = results; + if (testId) { + filtered = results.filter((r) => (r.test_id ?? 
r.eval_id) === testId); + } + + if (outputFormat === 'json') { + console.log(JSON.stringify(filtered, null, 2)); + } else { + console.log(formatShowTable(results, file, testId, tree)); + } + } catch (error) { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + } + }, +}); diff --git a/apps/cli/src/commands/inspect/stats.ts b/apps/cli/src/commands/inspect/stats.ts new file mode 100644 index 000000000..cf3df312c --- /dev/null +++ b/apps/cli/src/commands/inspect/stats.ts @@ -0,0 +1,261 @@ +import { command, oneOf, option, optional, positional, string } from 'cmd-ts'; +import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; +import { + type RawResult, + c, + formatCost, + formatNumber, + getTraceSummary, + loadResultFile, + padLeft, + padRight, +} from './utils.js'; + +/** + * Compute percentiles from a sorted array of numbers. + */ +export function percentile(sorted: number[], p: number): number { + if (sorted.length === 0) return 0; + const index = (p / 100) * (sorted.length - 1); + const lower = Math.floor(index); + const upper = Math.ceil(index); + if (lower === upper) return sorted[lower]; + return sorted[lower] + (sorted[upper] - sorted[lower]) * (index - lower); +} + +function mean(values: number[]): number { + if (values.length === 0) return 0; + return values.reduce((sum, v) => sum + v, 0) / values.length; +} + +interface MetricRow { + name: string; + values: number[]; + formatter: (n: number) => string; +} + +function collectMetrics(results: RawResult[]): MetricRow[] { + const rows: MetricRow[] = []; + + // Score + const scores = results.map((r) => r.score); + if (scores.length > 0) { + rows.push({ name: 'score', values: scores, formatter: (n) => n.toFixed(2) }); + } + + // Latency + const latencies = results.map((r) => r.duration_ms).filter((v): v is number => v !== undefined); + if (latencies.length > 0) { + rows.push({ + name: 'latency_s', + values: latencies.map((ms) => ms / 1000), + formatter: (n) => n.toFixed(1), + }); 
+ } + + // Cost + const costs = results.map((r) => r.cost_usd).filter((v): v is number => v !== undefined); + if (costs.length > 0) { + rows.push({ name: 'cost_usd', values: costs, formatter: (n) => formatCost(n) }); + } + + // Total tokens + const tokens = results + .map((r) => { + if (!r.token_usage) return undefined; + return r.token_usage.input + r.token_usage.output; + }) + .filter((v): v is number => v !== undefined); + if (tokens.length > 0) { + rows.push({ + name: 'tokens_total', + values: tokens, + formatter: (n) => formatNumber(Math.round(n)), + }); + } + + // Tool calls + const toolCalls = results + .map((r) => getTraceSummary(r)?.event_count) + .filter((v): v is number => v !== undefined); + if (toolCalls.length > 0) { + rows.push({ name: 'tool_calls', values: toolCalls, formatter: (n) => String(Math.round(n)) }); + } + + // LLM calls + const llmCalls = results + .map((r) => getTraceSummary(r)?.llm_call_count) + .filter((v): v is number => v !== undefined); + if (llmCalls.length > 0) { + rows.push({ name: 'llm_calls', values: llmCalls, formatter: (n) => String(Math.round(n)) }); + } + + return rows; +} + +interface GroupedResults { + label: string; + results: RawResult[]; +} + +function groupResults(results: RawResult[], groupBy?: string): GroupedResults[] { + if (!groupBy) return [{ label: 'all', results }]; + + const groups = new Map(); + + for (const result of results) { + let key: string; + switch (groupBy) { + case 'target': + key = result.target ?? 'unknown'; + break; + case 'eval-set': case 'suite': // --group-by offers 'eval-set'; both spellings group on result.suite + key = result.suite ?? 'unknown'; + break; + case 'test-id': + key = result.test_id ?? result.eval_id ??
'unknown'; + break; + default: + key = 'all'; + } + if (!groups.has(key)) groups.set(key, []); + groups.get(key)?.push(result); + } + + return [...groups.entries()] + .sort(([a], [b]) => a.localeCompare(b)) + .map(([label, results]) => ({ label, results })); +} + +function formatStatsTable(groups: GroupedResults[], filePath: string): string { + const lines: string[] = []; + + lines.push(''); + lines.push(`${c.bold}Statistics:${c.reset} ${c.cyan}${filePath}${c.reset}`); + + for (const group of groups) { + if (groups.length > 1 || group.label !== 'all') { + lines.push(''); + lines.push( + `${c.bold}Group: ${group.label}${c.reset} ${c.dim}(${group.results.length} tests)${c.reset}`, + ); + } else { + lines.push(`${c.dim}${group.results.length} tests${c.reset}`); + } + lines.push(''); + + const metrics = collectMetrics(group.results); + + if (metrics.length === 0) { + lines.push(`${c.yellow}No trace metrics available${c.reset}`); + continue; + } + + // Column headers + const nameWidth = Math.max(12, ...metrics.map((m) => m.name.length)); + const colWidth = 10; + + const header = ` ${padRight('Metric', nameWidth)} ${padLeft('Mean', colWidth)} ${padLeft('P50', colWidth)} ${padLeft('P90', colWidth)} ${padLeft('P95', colWidth)} ${padLeft('P99', colWidth)}`; + lines.push(`${c.dim}${header}${c.reset}`); + lines.push( + `${c.dim} ${'─'.repeat(nameWidth)} ${'─'.repeat(colWidth)} ${'─'.repeat(colWidth)} ${'─'.repeat(colWidth)} ${'─'.repeat(colWidth)} ${'─'.repeat(colWidth)}${c.reset}`, + ); + + for (const metric of metrics) { + const sorted = [...metric.values].sort((a, b) => a - b); + const row = ` ${padRight(metric.name, nameWidth)} ${padLeft(metric.formatter(mean(sorted)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 50)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 90)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 95)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 99)), colWidth)}`; + lines.push(row); + } + } + + 
lines.push(''); + return lines.join('\n'); +} + +interface StatsJson { + file: string; + groups: { + label: string; + count: number; + metrics: Record; + }[]; +} + +function computeStatsJson(groups: GroupedResults[], filePath: string): StatsJson { + return { + file: filePath, + groups: groups.map((group) => { + const metrics = collectMetrics(group.results); + const metricsObj: Record< + string, + { mean: number; p50: number; p90: number; p95: number; p99: number } + > = {}; + + for (const metric of metrics) { + const sorted = [...metric.values].sort((a, b) => a - b); + metricsObj[metric.name] = { + mean: Number(mean(sorted).toFixed(4)), + p50: Number(percentile(sorted, 50).toFixed(4)), + p90: Number(percentile(sorted, 90).toFixed(4)), + p95: Number(percentile(sorted, 95).toFixed(4)), + p99: Number(percentile(sorted, 99).toFixed(4)), + }; + } + + return { + label: group.label, + count: group.results.length, + metrics: metricsObj, + }; + }), + }; +} + +export const traceStatsCommand = command({ + name: 'stats', + description: 'Compute summary statistics (percentiles) across evaluation results', + args: { + file: positional({ + type: string, + displayName: 'trace-source', + description: + 'Path to a run workspace, result manifest, simple trace JSONL, or OTLP JSON file', + }), + groupBy: option({ + type: optional(oneOf(['target', 'eval-set', 'test-id'])), + long: 'group-by', + short: 'g', + description: 'Group statistics by: target, eval-set, or test-id', + }), + format: option({ + type: optional(oneOf(['table', 'json'])), + long: 'format', + short: 'f', + description: 'Output format: table (default) or json', + }), + }, + handler: async ({ file, groupBy, format }) => { + const outputFormat = format ?? 
'table'; + + try { + const results = loadResultFile(file); + + if (results.length === 0) { + console.error('Error: Result file is empty'); + process.exit(1); + } + + const groups = groupResults(results, groupBy); + + if (outputFormat === 'json') { + const statsJson = computeStatsJson(groups, file); + console.log(JSON.stringify(toSnakeCaseDeep(statsJson), null, 2)); + } else { + console.log(formatStatsTable(groups, file)); + } + } catch (error) { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + } + }, +}); diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts new file mode 100644 index 000000000..f10a97ab4 --- /dev/null +++ b/apps/cli/src/commands/inspect/utils.ts @@ -0,0 +1,646 @@ +import { readFileSync, readdirSync, statSync } from 'node:fs'; +import path from 'node:path'; +import type { EvaluationResult, TraceSummary } from '@agentv/core'; +import { PASS_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; +import { + RESULT_INDEX_FILENAME, + RESULT_RUNS_DIRNAME, + resolveExistingRunPrimaryPath, + resolveWorkspaceOrFilePath, +} from '../eval/result-layout.js'; +import { loadManifestResults } from '../results/manifest.js'; + +// ANSI color codes (no dependency needed) +const colors = { + reset: '\x1b[0m', + bold: '\x1b[1m', + dim: '\x1b[2m', + green: '\x1b[32m', + red: '\x1b[31m', + yellow: '\x1b[33m', + cyan: '\x1b[36m', + gray: '\x1b[90m', +}; + +const noColor = process.env.NO_COLOR !== undefined || !process.stdout.isTTY; +export const c = noColor + ? 
(Object.fromEntries(Object.keys(colors).map((k) => [k, ''])) as typeof colors) + : colors; + +// Regex to strip ANSI escape codes +const ansiPattern = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, 'g'); + +export function stripAnsi(str: string): string { + return str.replace(ansiPattern, ''); +} + +export function padRight(str: string, len: number): string { + const plainLen = stripAnsi(str).length; + return str + ' '.repeat(Math.max(0, len - plainLen)); +} + +export function padLeft(str: string, len: number): string { + const plainLen = stripAnsi(str).length; + return ' '.repeat(Math.max(0, len - plainLen)) + str; +} + +/** + * A raw JSONL result record with snake_case keys as stored on disk. + */ +export interface RawResult { + timestamp?: string; + test_id?: string; + eval_id?: string; + suite?: string; + conversation_id?: string; + score: number; + assertions?: { text: string; passed: boolean; evidence?: string }[]; + target?: string; + error?: string; + scores?: RawEvaluatorScore[]; + trace?: RawTraceSummary; + // Promoted execution metrics (snake_case from JSONL) + token_usage?: { input: number; output: number; cached?: number }; + cost_usd?: number; + duration_ms?: number; + start_time?: string; + end_time?: string; + input?: unknown; + output?: unknown; + spans?: RawTraceSpan[]; + trials?: unknown[]; + aggregation?: unknown; + file_changes?: string; +} + +export interface RawEvaluatorScore { + name: string; + type: string; + score: number; + assertions?: { text: string; passed: boolean; evidence?: string }[]; + weight?: number; +} + +export interface RawTraceSummary { + event_count?: number; + tool_calls?: Record; + error_count?: number; + tool_durations?: Record; + llm_call_count?: number; + // Execution metrics (present when trace includes provider metrics) + token_usage?: { input: number; output: number; cached?: number }; + cost_usd?: number; + duration_ms?: number; +} + +export interface RawTraceSpan { + type?: 'tool' | 'llm' | string; + name: 
string; + duration_ms?: number; +} + +/** + * Load all result or trace records from a supported source. + * + * Supported sources: + * - Run workspace directories / index.jsonl manifests + * - Standalone trace JSONL files for trace-only workflows + * - OTLP JSON trace files written via --otel-file + */ +export function loadResultFile(filePath: string): RawResult[] { + const resolvedFilePath = resolveTraceResultPath(filePath); + + if (path.extname(resolvedFilePath) === '.json') { + return loadOtlpTraceFile(resolvedFilePath); + } + + if (path.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) { + return loadManifestAsRawResults(resolvedFilePath); + } + + return loadJsonlRecords(resolvedFilePath); +} + +function resolveTraceResultPath(filePath: string): string { + return resolveWorkspaceOrFilePath(filePath); +} + +function loadJsonlRecords(filePath: string): RawResult[] { + const content = readFileSync(filePath, 'utf8'); + const lines = content + .trim() + .split('\n') + .filter((line) => line.trim()); + + return lines.map((line, i) => { + const record = JSON.parse(line) as RawResult; + if (typeof record.score !== 'number') { + throw new Error(`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`); + } + return record; + }); +} + +function loadManifestAsRawResults(filePath: string): RawResult[] { + return loadManifestResults(filePath).map(toRawResult); +} + +function toRawResult(result: EvaluationResult): RawResult { + return { + timestamp: result.timestamp, + test_id: result.testId, + suite: result.suite, + conversation_id: result.conversationId, + score: result.score, + assertions: result.assertions?.map((assertion) => ({ + text: assertion.text, + passed: assertion.passed, + evidence: assertion.evidence, + })), + target: result.target, + error: result.error, + scores: result.scores?.map((score) => ({ + name: score.name, + type: score.type, + score: score.score, + assertions: score.assertions?.map((assertion) => ({ + text: assertion.text, + 
passed: assertion.passed, + evidence: assertion.evidence, + })), + weight: score.weight, + })), + token_usage: result.tokenUsage + ? { + input: result.tokenUsage.input, + output: result.tokenUsage.output, + cached: result.tokenUsage.cached, + } + : undefined, + cost_usd: result.costUsd, + duration_ms: result.durationMs, + start_time: result.startTime, + end_time: result.endTime, + input: result.input, + output: result.output, + file_changes: result.fileChanges, + }; +} + +type OtlpAttributeValue = + | { stringValue?: string; intValue?: number | string; doubleValue?: number; boolValue?: boolean } + | { arrayValue?: { values?: OtlpAttributeValue[] } }; + +interface OtlpAttribute { + key: string; + value: OtlpAttributeValue; +} + +interface OtlpEvent { + name?: string; + attributes?: OtlpAttribute[]; +} + +interface OtlpSpan { + traceId?: string; + spanId?: string; + parentSpanId?: string; + name?: string; + startTimeUnixNano?: string; + endTimeUnixNano?: string; + attributes?: OtlpAttribute[]; + status?: { code?: number; message?: string }; + events?: OtlpEvent[]; +} + +function loadOtlpTraceFile(filePath: string): RawResult[] { + const parsed = JSON.parse(readFileSync(filePath, 'utf8')) as { + resourceSpans?: { scopeSpans?: { spans?: OtlpSpan[] }[] }[]; + }; + + const spans = parsed.resourceSpans + ?.flatMap((resource) => resource.scopeSpans ?? []) + .flatMap((scope) => scope.spans ?? []); + + if (!spans || spans.length === 0) { + return []; + } + + const spanMap = new Map(); + const childMap = new Map(); + + for (const span of spans) { + if (!span.spanId) continue; + spanMap.set(span.spanId, span); + if (span.parentSpanId) { + const siblings = childMap.get(span.parentSpanId) ?? []; + siblings.push(span); + childMap.set(span.parentSpanId, siblings); + } + } + + const roots = spans.filter((span) => !span.parentSpanId || !spanMap.has(span.parentSpanId)); + const supportedRoots = roots.filter(isAgentvEvalRoot); + const candidateRoots = supportedRoots.length > 0 ? 
supportedRoots : roots; + + return candidateRoots.map((root, index) => { + const descendants = collectChildSpans(root.spanId, childMap); + const rootAttrs = parseOtlpAttributes(root.attributes); + const parsedDescendants = descendants.map((span) => ({ + ...span, + parsedAttributes: parseOtlpAttributes(span.attributes), + })); + const toolSpans = parsedDescendants.filter( + (span) => typeof span.parsedAttributes.gen_ai_tool_name === 'string', + ); + const llmSpans = parsedDescendants.filter( + (span) => + span.parsedAttributes.gen_ai_operation_name === 'chat' || + (typeof span.name === 'string' && span.name.startsWith('chat ')), + ); + const tokenUsage = descendants.reduce( + (acc, span) => { + const attrs = parseOtlpAttributes(span.attributes); + acc.input += numberAttr(attrs.gen_ai_usage_input_tokens) ?? 0; + acc.output += numberAttr(attrs.gen_ai_usage_output_tokens) ?? 0; + const cached = numberAttr(attrs.gen_ai_usage_cache_read_input_tokens); + if (cached !== undefined && cached > 0) { + acc.cached = (acc.cached ?? 0) + cached; + } + return acc; + }, + { input: 0, output: 0, cached: undefined as number | undefined }, + ); + + const traceSummary = buildDerivedTraceSummary({ + trace: { + event_count: + numberAttr(rootAttrs.agentv_trace_event_count) ?? + (toolSpans.length > 0 ? toolSpans.length : undefined), + tool_calls: countRawSpanNames( + toolSpans.map((span) => ({ + type: 'tool', + name: String(span.parsedAttributes.gen_ai_tool_name), + })), + ), + error_count: descendants.filter((span) => span.status?.code === 2).length || undefined, + llm_call_count: + numberAttr(rootAttrs.agentv_trace_llm_call_count) ?? + (llmSpans.length > 0 ? llmSpans.length : undefined), + }, + spans: [ + ...llmSpans.map((span) => ({ + type: 'llm' as const, + name: span.name ?? 
'chat', + duration_ms: durationFromSpan(span), + })), + ...toolSpans.map((span) => ({ + type: 'tool' as const, + name: String(span.parsedAttributes.gen_ai_tool_name), + duration_ms: durationFromSpan(span), + })), + ], + duration_ms: numberAttr(rootAttrs.agentv_trace_duration_ms) ?? durationFromSpan(root), + cost_usd: numberAttr(rootAttrs.agentv_trace_cost_usd), + token_usage: + tokenUsage.input || + tokenUsage.output || + tokenUsage.cached || + numberAttr(rootAttrs.agentv_trace_token_input) || + numberAttr(rootAttrs.agentv_trace_token_output) || + numberAttr(rootAttrs.agentv_trace_token_cached) + ? { + input: tokenUsage.input || numberAttr(rootAttrs.agentv_trace_token_input) || 0, + output: tokenUsage.output || numberAttr(rootAttrs.agentv_trace_token_output) || 0, + ...(tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) + ? { + cached: + tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) || 0, + } + : {}), + } + : undefined, + }); + + const score = numberAttr(rootAttrs.agentv_score); + if (score === undefined) { + throw new Error( + `Unsupported OTLP trace root span at index ${index + 1}: missing agentv.score attribute`, + ); + } + + return { + test_id: + stringAttr(rootAttrs.agentv_test_id) ?? + stringAttr(rootAttrs.agentv_eval_id) ?? + `trace-${index + 1}`, + suite: stringAttr(rootAttrs.agentv_suite), + target: stringAttr(rootAttrs.agentv_target), + score, + error: root.status?.code === 2 ? root.status.message : undefined, + cost_usd: traceSummary?.cost_usd, + duration_ms: traceSummary?.duration_ms, + token_usage: traceSummary?.token_usage, + trace: traceSummary + ? 
{ + event_count: traceSummary.event_count, + tool_calls: traceSummary.tool_calls, + error_count: traceSummary.error_count, + tool_durations: traceSummary.tool_durations, + llm_call_count: traceSummary.llm_call_count, + token_usage: traceSummary.token_usage, + cost_usd: traceSummary.cost_usd, + duration_ms: traceSummary.duration_ms, + } + : undefined, + spans: traceSummary?.spans, + output: stringAttr(rootAttrs.agentv_output_text), + scores: root.events + ?.filter( + (event) => + event.name?.startsWith('agentv.grader.') || event.name?.startsWith('agentv.evaluator.'), + ) + .map((event) => { + const attrs = parseOtlpAttributes(event.attributes); + const name = + event.name?.replace(/^agentv\.grader\./, '').replace(/^agentv\.evaluator\./, '') ?? + 'unknown'; + return { + name, + type: + stringAttr(attrs.agentv_grader_type) ?? + stringAttr(attrs.agentv_evaluator_type) ?? + 'unknown', + score: + numberAttr(attrs.agentv_grader_score) ?? + numberAttr(attrs.agentv_evaluator_score) ?? + 0, + }; + }), + } satisfies RawResult; + }); +} + +function isAgentvEvalRoot(span: OtlpSpan): boolean { + const attrs = parseOtlpAttributes(span.attributes); + return ( + span.name === 'agentv.eval' || + numberAttr(attrs.agentv_score) !== undefined || + typeof stringAttr(attrs.agentv_test_id) === 'string' + ); +} + +function collectChildSpans( + spanId: string | undefined, + childMap: Map, +): OtlpSpan[] { + if (!spanId) return []; + const direct = childMap.get(spanId) ?? []; + const all = [...direct]; + for (const child of direct) { + all.push(...collectChildSpans(child.spanId, childMap)); + } + return all; +} + +function parseOtlpAttributes(attributes: OtlpAttribute[] | undefined): Record { + const parsed: Record = {}; + for (const attribute of attributes ?? 
[]) { + parsed[attribute.key.replace(/\./g, '_')] = parseOtlpValue(attribute.value); + } + return parsed; +} + +function parseOtlpValue(value: OtlpAttributeValue | undefined): unknown { + if (!value) return undefined; + if ('stringValue' in value && value.stringValue !== undefined) return value.stringValue; + if ('intValue' in value && value.intValue !== undefined) return Number(value.intValue); + if ('doubleValue' in value && value.doubleValue !== undefined) return value.doubleValue; + if ('boolValue' in value && value.boolValue !== undefined) return value.boolValue; + if ('arrayValue' in value) + return (value.arrayValue?.values ?? []).map((entry) => parseOtlpValue(entry)); + return undefined; +} + +function durationFromSpan( + span: Pick, +): number | undefined { + const start = Number(span.startTimeUnixNano); + const end = Number(span.endTimeUnixNano); + if (!Number.isFinite(start) || !Number.isFinite(end)) return undefined; + return Math.round((end - start) / 1_000_000); +} + +function stringAttr(value: unknown): string | undefined { + return typeof value === 'string' ? value : undefined; +} + +function numberAttr(value: unknown): number | undefined { + return typeof value === 'number' && Number.isFinite(value) ? value : undefined; +} + +interface DerivedTraceSummary extends RawTraceSummary { + spans?: RawTraceSpan[]; +} + +export function buildDerivedTraceSummary(result: { + trace?: RawTraceSummary; + spans?: RawTraceSpan[]; + token_usage?: RawResult['token_usage']; + cost_usd?: number; + duration_ms?: number; +}): DerivedTraceSummary | undefined { + const toolSpans = (result.spans ?? []).filter((span) => span.type === 'tool'); + const llmSpans = (result.spans ?? []).filter((span) => span.type === 'llm'); + const toolCalls = result.trace?.tool_calls ?? countRawSpanNames(toolSpans); + const toolDurations = result.trace?.tool_durations ?? groupRawSpanDurations(toolSpans); + const hasSpanData = (result.spans?.length ?? 
0) > 0; + const eventCount = result.trace?.event_count ?? (hasSpanData ? toolSpans.length : undefined); + const llmCallCount = result.trace?.llm_call_count ?? (hasSpanData ? llmSpans.length : undefined); + + if ( + !result.trace && + !result.spans?.length && + result.token_usage === undefined && + result.cost_usd === undefined && + result.duration_ms === undefined + ) { + return undefined; + } + + return { + event_count: eventCount, + tool_calls: toolCalls, + error_count: result.trace?.error_count, + tool_durations: toolDurations, + llm_call_count: llmCallCount, + token_usage: result.trace?.token_usage ?? result.token_usage, + cost_usd: result.trace?.cost_usd ?? result.cost_usd, + duration_ms: result.trace?.duration_ms ?? result.duration_ms, + spans: result.spans, + }; +} + +function countRawSpanNames(spans: RawTraceSpan[]): Record | undefined { + const counts: Record = {}; + for (const span of spans) { + counts[span.name] = (counts[span.name] ?? 0) + 1; + } + return Object.keys(counts).length > 0 ? counts : undefined; +} + +function groupRawSpanDurations(spans: RawTraceSpan[]): Record | undefined { + const grouped: Record = {}; + for (const span of spans) { + if (span.duration_ms === undefined) continue; + const existing = grouped[span.name] ?? []; + existing.push(span.duration_ms); + grouped[span.name] = existing; + } + return Object.keys(grouped).length > 0 ? grouped : undefined; +} + +export function getTraceSummary(result: RawResult): RawTraceSummary | undefined { + const derived = buildDerivedTraceSummary(result); + if (!derived) return undefined; + const { spans: _spans, ...trace } = derived; + return trace; +} + +export function getTraceSpans(result: RawResult): RawTraceSpan[] { + return buildDerivedTraceSummary(result)?.spans ?? 
[]; +} + +export function toTraceSummary(result: RawResult): TraceSummary | undefined { + const rawTrace = getTraceSummary(result); + if (!rawTrace) return undefined; + return toCamelCaseDeep(rawTrace) as TraceSummary; +} + +/** + * Metadata about a discovered run manifest for listing. + */ +export interface ResultFileMeta { + path: string; + filename: string; + timestamp: string; + testCount: number; + passRate: number; + avgScore: number; + sizeBytes: number; +} + +/** + * Enumerate canonical run manifests in `.agentv/results/runs/`. + */ +export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { + const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME); + + const files: { filePath: string; displayName: string }[] = []; + + try { + const entries = readdirSync(runsDir, { withFileTypes: true }); + for (const entry of entries) { + if (!entry.isDirectory()) { + continue; + } + + const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name)); + if (primaryPath) { + files.push({ filePath: primaryPath, displayName: entry.name }); + } + } + } catch { + // runs/ doesn't exist yet + } + + // Sort by display name descending (most recent first) + files.sort((a, b) => b.displayName.localeCompare(a.displayName)); + + const limited = limit !== undefined && limit > 0 ? files.slice(0, limit) : files; + + const metas: ResultFileMeta[] = []; + + for (const { filePath, displayName } of limited) { + try { + const fileStat = statSync(filePath); + const results = loadResultFile(filePath); + + const testCount = results.length; + const passCount = results.filter((r) => r.score >= PASS_THRESHOLD).length; + const passRate = testCount > 0 ? passCount / testCount : 0; + const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0; + + const filenameTimestamp = extractTimestampFromFilename(displayName); + const timestamp = filenameTimestamp ?? results[0]?.timestamp ?? 
'unknown'; + + metas.push({ + path: filePath, + filename: displayName, + timestamp, + testCount, + passRate, + avgScore, + sizeBytes: fileStat.size, + }); + } catch { + // Skip unreadable files + } + } + + return metas; +} + +/** + * Extract ISO timestamp from eval filename like eval_2026-02-20T21-38-05-833Z.jsonl + */ +export function extractTimestampFromFilename(filename: string): string | undefined { + const match = filename.match( + /(?:^|eval_)(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z)(?:\.jsonl)?$/, + ); + if (!match) return undefined; + // Re-convert dashes back to colons/dots for display + return match[1].replace(/-(\d{2})-(\d{2})-(\d{3})Z$/, ':$1:$2.$3Z'); +} + +/** + * Format a number with commas for display. + */ +export function formatNumber(n: number): string { + return n.toLocaleString(); +} + +/** + * Format duration in ms to human-readable. + */ +export function formatDuration(ms: number): string { + if (ms < 1000) return `${Math.round(ms)}ms`; + if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`; + const minutes = Math.floor(ms / 60000); + const seconds = ((ms % 60000) / 1000).toFixed(0); + return `${minutes}m${seconds}s`; +} + +/** + * Format cost in USD. + */ +export function formatCost(usd: number): string { + if (usd < 0.01) return `$${usd.toFixed(4)}`; + return `$${usd.toFixed(3)}`; +} + +/** + * Format file size for display. + */ +export function formatSize(bytes: number): string { + if (bytes < 1024) return `${bytes}B`; + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}KB`; + return `${(bytes / (1024 * 1024)).toFixed(1)}MB`; +} + +/** + * Format a score as percentage. 
+ */ +export function formatScore(score: number): string { + return `${(score * 100).toFixed(0)}%`; +} diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 18cf70feb..29e26eb51 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -7,6 +7,7 @@ import { createCommand } from './commands/create/index.js'; import { evalCommand } from './commands/eval/index.js'; import { importCommand } from './commands/import/index.js'; import { initCmdTsCommand } from './commands/init/index.js'; +import { inspectCommand } from './commands/inspect/index.js'; import { pipelineCommand } from './commands/pipeline/index.js'; import { resultsCommand } from './commands/results/index.js'; import { resultsServeCommand } from './commands/results/serve.js'; @@ -35,7 +36,8 @@ export const app = subcommands({ self: selfCommand, serve: resultsServeCommand, studio: resultsServeCommand, - trace: traceCommand, + inspect: inspectCommand, + trace: traceCommand, // deprecated alias — use `inspect` instead trend: trendCommand, transpile: transpileCommand, trim: trimCommand, @@ -56,6 +58,7 @@ const EVAL_SUBCOMMANDS = new Set(['run', 'assert']); */ const TOP_LEVEL_COMMANDS = new Set([ 'import', + 'inspect', 'compare', 'convert', 'create', diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index f855e54fd..10e2dd380 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -742,7 +742,8 @@ export type ResolvedTarget = readonly config: VSCodeResolvedConfig; }) | (ResolvedTargetBase & { readonly kind: 'agentv'; readonly config: AgentVResolvedConfig }) - | (ResolvedTargetBase & { readonly kind: 'cli'; readonly config: CliResolvedConfig }); + | (ResolvedTargetBase & { readonly kind: 'cli'; readonly config: CliResolvedConfig }) + | (ResolvedTargetBase & { readonly kind: 'transcript'; readonly config: Record }); /** * Optional settings accepted on ALL target 
definitions regardless of provider. diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 970d254dd..d789daefd 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -31,7 +31,8 @@ export type ProviderKind = | 'mock' | 'vscode' | 'vscode-insiders' - | 'agentv'; + | 'agentv' + | 'transcript'; /** * Agent providers that spawn interactive sessions with filesystem access. @@ -78,6 +79,7 @@ export const KNOWN_PROVIDERS: readonly ProviderKind[] = [ 'vscode', 'vscode-insiders', 'agentv', + 'transcript', ] as const; /** diff --git a/packages/core/src/import/codex-parser.ts b/packages/core/src/import/codex-parser.ts new file mode 100644 index 000000000..368452847 --- /dev/null +++ b/packages/core/src/import/codex-parser.ts @@ -0,0 +1,238 @@ +/** + * Codex CLI session JSONL parser. + * + * Reads a Codex CLI rollout transcript + * (~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl) and converts it to AgentV's + * Message[] format. 
+ * + * Each line is a JSON object with one of these top-level types: + * session_meta → session metadata (id, cwd, cli_version, model) + * turn_context → per-turn context (model, cwd, turn_id) + * event_msg → events: task_started, task_complete, user_message, + * agent_message, token_count + * response_item → conversation items: message, function_call, + * function_call_output, reasoning, custom_tool_call, + * custom_tool_call_output + * + * Key behaviors: + * - response_item with type=message and role=user → user Message + * - response_item with type=message and role=assistant → assistant Message + * - response_item with type=function_call → ToolCall (pending output) + * - response_item with type=function_call_output → matched to pending call by call_id + * - response_item with type=reasoning → skipped (thinking tokens) + * - response_item with role=developer → skipped (system prompt) + * - session_meta → source metadata (session_id, cwd, version, model) + * - turn_context → model name extraction + * - Duration is from first↔last event timestamp + * - cost_usd is null (Codex CLI does not report per-session cost) + * - Token usage not available from rollout format (rate limit info only) + * + * To add a new response_item type: add a case to the switch in parseCodexSession(). 
+ */ + +import type { Message, ToolCall } from '../evaluation/providers/types.js'; +import type { TranscriptEntry, TranscriptSource } from './types.js'; + +interface CodexLine { + readonly timestamp?: string; + readonly type: string; + readonly payload: Record; +} + +export function parseCodexSession(jsonl: string): TranscriptEntry { + const messages: Message[] = []; + let sessionId = ''; + let cwd: string | undefined; + let model: string | undefined; + let version: string | undefined; + let startTimestamp: string | undefined; + let endTimestamp: string | undefined; + + // Track pending function calls by call_id + const pendingCalls = new Map(); + + const lines = jsonl.split('\n').filter((l) => l.trim().length > 0); + + for (const line of lines) { + let entry: CodexLine; + try { + entry = JSON.parse(line) as CodexLine; + } catch { + continue; + } + + if (!entry.type) continue; + + // Track timestamps for duration + if (entry.timestamp) { + if (!startTimestamp) startTimestamp = entry.timestamp; + endTimestamp = entry.timestamp; + } + + const payload = entry.payload ?? {}; + + switch (entry.type) { + case 'session_meta': { + sessionId = String(payload.id ?? ''); + cwd = payload.cwd ? String(payload.cwd) : undefined; + version = payload.cli_version ? String(payload.cli_version) : undefined; + if (payload.model && !model) { + model = String(payload.model); + } + break; + } + + case 'turn_context': { + if (payload.model && !model) { + model = String(payload.model); + } + if (payload.cwd && !cwd) { + cwd = String(payload.cwd); + } + break; + } + + case 'response_item': { + const itemType = String(payload.type ?? ''); + const role = String(payload.role ?? 
''); + + switch (itemType) { + case 'message': { + // Skip developer (system prompt) messages + if (role === 'developer') break; + + const content = extractResponseItemContent(payload.content); + if (role === 'user' && content) { + messages.push({ role: 'user', content }); + } else if (role === 'assistant' && content) { + messages.push({ role: 'assistant', content }); + } + break; + } + + case 'function_call': { + const toolName = String(payload.name ?? ''); + const callId = String(payload.call_id ?? ''); + let input: unknown; + if (typeof payload.arguments === 'string') { + try { + input = JSON.parse(payload.arguments); + } catch { + input = payload.arguments; + } + } else { + input = payload.arguments; + } + + const toolCall: ToolCall = { tool: toolName, input, id: callId }; + const msgIdx = messages.length; + messages.push({ + role: 'assistant', + toolCalls: [toolCall], + }); + + if (callId) { + pendingCalls.set(callId, { msgIdx, toolIdx: 0 }); + } + break; + } + + case 'custom_tool_call': { + const toolName = String(payload.name ?? ''); + const callId = String(payload.call_id ?? ''); + let input: unknown; + if (typeof payload.arguments === 'string') { + try { + input = JSON.parse(payload.arguments); + } catch { + input = payload.arguments; + } + } else { + input = payload.arguments; + } + + const toolCall: ToolCall = { tool: toolName, input, id: callId }; + const msgIdx = messages.length; + messages.push({ + role: 'assistant', + toolCalls: [toolCall], + }); + + if (callId) { + pendingCalls.set(callId, { msgIdx, toolIdx: 0 }); + } + break; + } + + case 'function_call_output': + case 'custom_tool_call_output': { + const callId = String(payload.call_id ?? ''); + const pending = pendingCalls.get(callId); + if (pending) { + const existingMsg = messages[pending.msgIdx]; + const existingCalls = [...(existingMsg.toolCalls ?? 
[])]; + existingCalls[pending.toolIdx] = { + ...existingCalls[pending.toolIdx], + output: payload.output, + }; + messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls }; + pendingCalls.delete(callId); + } + break; + } + + // Skip reasoning blocks (thinking tokens) + case 'reasoning': + break; + } + break; + } + + // Skip event_msg types (task_started, task_complete, token_count, etc.) + // They don't contain conversation content + } + } + + let durationMs: number | undefined; + if (startTimestamp && endTimestamp) { + durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime(); + } + + const source: TranscriptSource = { + provider: 'codex', + sessionId, + cwd, + startedAt: startTimestamp, + model, + version, + }; + + return { + messages, + source, + // Codex rollout files don't include token counts (only rate limit info) + tokenUsage: undefined, + durationMs, + costUsd: null, + }; +} + +/** + * Extract text content from a Codex response_item content array. + * Content is typically: [{ type: "input_text"|"output_text", text: "..." }] + */ +function extractResponseItemContent(content: unknown): string | undefined { + if (typeof content === 'string') return content; + if (!Array.isArray(content)) return undefined; + + const parts: string[] = []; + for (const block of content) { + if (typeof block === 'object' && block !== null) { + const b = block as Record; + if (typeof b.text === 'string') { + parts.push(b.text); + } + } + } + return parts.length > 0 ? parts.join('') : undefined; +} diff --git a/packages/core/src/import/codex-session-discovery.ts b/packages/core/src/import/codex-session-discovery.ts new file mode 100644 index 000000000..08a03a7a4 --- /dev/null +++ b/packages/core/src/import/codex-session-discovery.ts @@ -0,0 +1,113 @@ +/** + * Codex CLI session discovery. + * + * Scans ~/.codex/sessions/ for rollout JSONL files. 
Codex CLI stores sessions at: + * ~/.codex/sessions/YYYY/MM/DD/rollout--.jsonl + * + * Sessions are returned sorted by modification time (most recent first). + */ + +import { readdir, stat } from 'node:fs/promises'; +import { homedir } from 'node:os'; +import path from 'node:path'; + +export interface CodexSession { + /** UUID from the filename */ + readonly sessionId: string; + /** Full path to the JSONL file */ + readonly filePath: string; + /** Filename (e.g., rollout-2026-03-29T14-22-01-.jsonl) */ + readonly filename: string; + /** Last modification time */ + readonly updatedAt: Date; +} + +export interface CodexDiscoverOptions { + /** Filter by date string (YYYY-MM-DD). */ + readonly date?: string; + /** Maximum number of sessions to return (default: 10). */ + readonly limit?: number; + /** Override the default ~/.codex/sessions directory. */ + readonly sessionsDir?: string; + /** Return only the most recent session. */ + readonly latest?: boolean; +} + +const DEFAULT_SESSIONS_DIR = () => path.join(homedir(), '.codex', 'sessions'); + +export async function discoverCodexSessions(opts?: CodexDiscoverOptions): Promise { + const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR(); + const limit = opts?.latest ? 1 : (opts?.limit ?? 
10); + + const sessions: CodexSession[] = []; + + // Walk YYYY/MM/DD directory structure + let yearDirs: string[]; + try { + yearDirs = await readdir(sessionsDir); + } catch { + return []; + } + + for (const year of yearDirs) { + const yearPath = path.join(sessionsDir, year); + let monthDirs: string[]; + try { + monthDirs = await readdir(yearPath); + } catch { + continue; + } + + for (const month of monthDirs) { + const monthPath = path.join(yearPath, month); + let dayDirs: string[]; + try { + dayDirs = await readdir(monthPath); + } catch { + continue; + } + + for (const day of dayDirs) { + // Filter by date if specified + if (opts?.date) { + const dirDate = `${year}-${month}-${day}`; + if (dirDate !== opts.date) continue; + } + + const dayPath = path.join(monthPath, day); + let files: string[]; + try { + files = await readdir(dayPath); + } catch { + continue; + } + + for (const file of files) { + if (!file.startsWith('rollout-') || !file.endsWith('.jsonl')) continue; + + const filePath = path.join(dayPath, file); + + // Extract UUID from filename: rollout--.jsonl + // UUID is the last segment before .jsonl + const nameWithoutExt = file.replace(/\.jsonl$/, ''); + const parts = nameWithoutExt.split('-'); + // UUID is typically the last 5 hyphen-separated segments (standard UUID format) + const sessionId = parts.length >= 6 ? 
parts.slice(-5).join('-') : nameWithoutExt; + + let updatedAt: Date; + try { + const fileStat = await stat(filePath); + updatedAt = fileStat.mtime; + } catch { + updatedAt = new Date(0); + } + + sessions.push({ sessionId, filePath, filename: file, updatedAt }); + } + } + } + } + + sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime()); + return sessions.slice(0, limit); +} diff --git a/packages/core/src/import/index.ts b/packages/core/src/import/index.ts index 7e695fd3b..664ef534a 100644 --- a/packages/core/src/import/index.ts +++ b/packages/core/src/import/index.ts @@ -1,7 +1,33 @@ export { parseClaudeSession } from './claude-parser.js'; +export { parseCodexSession } from './codex-parser.js'; +export { + discoverCodexSessions, + type CodexDiscoverOptions, + type CodexSession, +} from './codex-session-discovery.js'; export { discoverClaudeSessions, type ClaudeDiscoverOptions, type ClaudeSession, } from './session-discovery.js'; -export { readTranscriptFile, type TranscriptEntry, type TranscriptSource } from './types.js'; +export { TranscriptProvider } from './transcript-provider.js'; +export { + readTranscriptFile, + readTranscriptJsonl, + toTranscriptJsonLine, + type TranscriptEntry, + type TranscriptJsonLine, + type TranscriptSource, +} from './types.js'; + +// Re-export existing Copilot parser and discovery for the import pipeline +export { + parseCopilotEvents, + type ParsedCopilotSession, + type CopilotSessionMeta, +} from '../evaluation/providers/copilot-log-parser.js'; +export { + discoverCopilotSessions, + type CopilotSession, + type DiscoverOptions as CopilotDiscoverOptions, +} from '../evaluation/providers/copilot-session-discovery.js'; diff --git a/packages/core/src/import/transcript-provider.ts b/packages/core/src/import/transcript-provider.ts new file mode 100644 index 000000000..b1c43f85a --- /dev/null +++ b/packages/core/src/import/transcript-provider.ts @@ -0,0 +1,75 @@ +/** + * Transcript provider — replays pre-recorded session 
transcripts through the + * evaluation pipeline without invoking any live agent. + * + * Used by `agentv eval --transcript ` to grade imported sessions. + * + * How it works: + * 1. Reads a transcript JSONL file (produced by `agentv import`) + * 2. Each invocation pops the next line from the transcript + * 3. Returns a ProviderResponse with pre-populated output, token usage, etc. + * 4. Evaluators run identically to live eval — they see the same ProviderResponse + * + * The provider name in results is set to the source provider from the transcript + * (e.g., "claude", "codex", "copilot"). + */ + +import type { Provider, ProviderRequest, ProviderResponse } from '../evaluation/providers/types.js'; +import type { TranscriptJsonLine } from './types.js'; +import { readTranscriptJsonl } from './types.js'; + +export class TranscriptProvider implements Provider { + readonly id: string; + readonly kind = 'transcript' as const; + readonly targetName: string; + + private lines: TranscriptJsonLine[]; + private cursor = 0; + + constructor(targetName: string, lines: TranscriptJsonLine[]) { + this.targetName = targetName; + this.id = `transcript:${targetName}`; + this.lines = lines; + } + + /** + * Create a TranscriptProvider from a JSONL file path. + */ + static async fromFile(filePath: string): Promise { + const lines = await readTranscriptJsonl(filePath); + if (lines.length === 0) { + throw new Error(`Transcript file is empty: ${filePath}`); + } + const providerName = lines[0].source.provider ?? 'transcript'; + return new TranscriptProvider(providerName, lines); + } + + get lineCount(): number { + return this.lines.length; + } + + async invoke(_request: ProviderRequest): Promise { + if (this.cursor >= this.lines.length) { + throw new Error( + `Transcript exhausted: ${this.lines.length} line(s) available but ` + + `${this.cursor + 1} invocations attempted. 
Each transcript line maps to one test case.`, + ); + } + + const line = this.lines[this.cursor++]; + + return { + output: line.output, + tokenUsage: line.token_usage + ? { + input: line.token_usage.input, + output: line.token_usage.output, + cached: line.token_usage.cached, + } + : undefined, + durationMs: line.duration_ms, + costUsd: line.cost_usd ?? undefined, + startTime: line.source.timestamp, + }; + } +} diff --git a/packages/core/src/import/types.ts b/packages/core/src/import/types.ts index 5595dfd82..109fa3f1c 100644 --- a/packages/core/src/import/types.ts +++ b/packages/core/src/import/types.ts @@ -1,11 +1,17 @@ /** * Core types for the transcript import pipeline. * - * A TranscriptEntry represents a single event in a parsed agent session - * transcript (user message, assistant response, tool call, etc.). + * A TranscriptEntry is the internal (camelCase) representation of a parsed + * session. A TranscriptJsonLine is the on-disk (snake_case) wire format + * written to .agentv/transcripts/*.jsonl files. * - * A TranscriptSource describes where a transcript came from (provider, - * session ID, file path, etc.). + * Flow: + * raw session JSONL → parser → TranscriptEntry (internal) + * TranscriptEntry → toTranscriptJsonLine() → JSONL on disk + * JSONL on disk → readTranscriptJsonl() → TranscriptJsonLine[] + * + * To add a new importer: write a parser that returns TranscriptEntry, + * then use toTranscriptJsonLine() to serialize. */ import { readFile } from 'node:fs/promises'; @@ -13,7 +19,7 @@ import { readFile } from 'node:fs/promises'; import type { Message, ProviderTokenUsage } from '../evaluation/providers/types.js'; /** - * A parsed transcript: ordered messages plus session metadata. + * A parsed transcript: ordered messages plus session metadata (internal camelCase). */ export interface TranscriptEntry { readonly messages: Message[]; @@ -24,7 +30,7 @@ export interface TranscriptEntry { } /** - * Metadata describing the origin of a transcript. 
+ * Metadata describing the origin of a transcript (internal camelCase). */ export interface TranscriptSource { readonly provider: string; @@ -32,6 +38,79 @@ export interface TranscriptSource { readonly projectPath?: string; readonly startedAt?: string; readonly model?: string; + readonly version?: string; + readonly gitBranch?: string; + readonly cwd?: string; +} + +/** + * One line in a transcript JSONL file (snake_case wire format). + * + * Each line is a self-contained test case with pre-populated output. + * The `input` field is the first user message; the `output` field is the + * full conversation (Message[]). + */ +export interface TranscriptJsonLine { + readonly input: string; + readonly output: readonly Message[]; + readonly token_usage?: { + readonly input: number; + readonly output: number; + readonly cached?: number; + }; + readonly duration_ms?: number; + readonly cost_usd?: number | null; + readonly source: { + readonly provider: string; + readonly session_id: string; + readonly model?: string; + readonly timestamp?: string; + readonly git_branch?: string; + readonly cwd?: string; + readonly version?: string; + }; +} + +/** + * Convert a parsed TranscriptEntry to the on-disk JSONL wire format. + */ +export function toTranscriptJsonLine(entry: TranscriptEntry): TranscriptJsonLine { + const firstUserMessage = entry.messages.find((m) => m.role === 'user'); + const input = typeof firstUserMessage?.content === 'string' ? firstUserMessage.content : ''; + + return { + input, + output: entry.messages, + token_usage: entry.tokenUsage + ? { + input: entry.tokenUsage.input, + output: entry.tokenUsage.output, + cached: entry.tokenUsage.cached, + } + : undefined, + duration_ms: entry.durationMs, + cost_usd: entry.costUsd, + source: { + provider: entry.source.provider, + session_id: entry.source.sessionId, + model: entry.source.model, + timestamp: entry.source.startedAt, + git_branch: entry.source.gitBranch, + cwd: entry.source.cwd ?? 
entry.source.projectPath, + version: entry.source.version, + }, + }; +} + +/** + * Read a transcript JSONL file and parse each line into a TranscriptJsonLine. + */ +export async function readTranscriptJsonl(filePath: string): Promise { + const text = await readFile(filePath, 'utf8'); + return text + .split('\n') + .filter((line) => line.trim().length > 0) + .map((line) => JSON.parse(line) as TranscriptJsonLine); } /** From 6ab1c6b92338f420e11ca75d0dcfd76c126efb43 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 6 Apr 2026 02:00:17 +0000 Subject: [PATCH 2/4] fix: prevent transcript provider from being used as LLM grader When --transcript is used without --grader-target, the orchestrator's grader resolution would fall back to using the transcript provider as the grader, exhausting the transcript on the second invoke() call. Fix: return undefined from resolveGraderProvider when the target is a transcript provider so LLM-based evaluators skip gracefully. Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/core/src/evaluation/orchestrator.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 68191dd11..49bec3ed0 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -391,6 +391,12 @@ export async function runEvaluation( // TODO: When --model is provided without --grader-target, override the model of // whichever grader target is resolved. For now, --model only works with --grader-target agentv. + // Transcript providers are passive replay — they cannot serve as LLM graders. + // Return undefined so LLM-based evaluators skip gracefully. + if (targetContext.kind === 'transcript') { + return undefined; + } + const graderName = targetContext.graderTarget ?? 
targetContext.name; const resolvedGrader = resolveTargetByName(graderName); if (!resolvedGrader) { From 48642825ef9d84cee5eac335b1a41023fe2b92f6 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 6 Apr 2026 02:15:00 +0000 Subject: [PATCH 3/4] refactor: use LLM_GRADER_CAPABLE_KINDS allowlist for grader resolution Replace the transcript-specific point check with a proper allowlist of provider kinds that can return structured JSON for LLM grading. Previously, resolveGraderProvider would blindly fall back to using the eval target as its own grader when no grader_target was configured. This silently broke for transcript, copilot-log, cli, and any other provider that can't produce grader responses. Now only providers in LLM_GRADER_CAPABLE_KINDS (openai, openrouter, azure, anthropic, gemini, agentv, mock) are used as fallback graders. All others return undefined, causing LLM-based evaluators to skip with a clear error rather than fail silently. Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/core/src/evaluation/orchestrator.ts | 17 ++++++++++------- .../core/src/evaluation/providers/types.ts | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 49bec3ed0..c58a0c5b8 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -31,7 +31,11 @@ import type { ProviderStreamCallbacks, TargetDefinition, } from './providers/types.js'; -import { extractLastAssistantContent, isAgentProvider } from './providers/types.js'; +import { + LLM_GRADER_CAPABLE_KINDS, + extractLastAssistantContent, + isAgentProvider, +} from './providers/types.js'; import { createBuiltinRegistry, discoverAssertions, discoverGraders } from './registry/index.js'; import { type TokenUsage, @@ -391,15 +395,14 @@ export async function runEvaluation( // TODO: When --model is provided without --grader-target, override the 
model of // whichever grader target is resolved. For now, --model only works with --grader-target agentv. - // Transcript providers are passive replay — they cannot serve as LLM graders. - // Return undefined so LLM-based evaluators skip gracefully. - if (targetContext.kind === 'transcript') { - return undefined; - } - const graderName = targetContext.graderTarget ?? targetContext.name; const resolvedGrader = resolveTargetByName(graderName); if (!resolvedGrader) { + // Only use the eval target as its own grader if it can return structured JSON. + // Agent providers, transcript, cli, and copilot-log cannot grade. + if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) { + return undefined; + } return getOrCreateProvider(targetContext); } return getOrCreateProvider(resolvedGrader); diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index d789daefd..b24833643 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -55,6 +55,24 @@ export const AGENT_PROVIDER_KINDS: readonly ProviderKind[] = [ 'vscode-insiders', ] as const; +/** + * Provider kinds that can return structured JSON for LLM grading. + * Used by the orchestrator to decide whether a target can double as its own + * grader when no explicit grader_target is configured. + * + * Providers NOT in this list (agent providers, transcript, cli, copilot-log) + * cannot produce grader responses and should not be used as graders. + */ +export const LLM_GRADER_CAPABLE_KINDS: readonly ProviderKind[] = [ + 'openai', + 'openrouter', + 'azure', + 'anthropic', + 'gemini', + 'agentv', + 'mock', +] as const; + /** * List of all supported provider kinds. * This is the source of truth for provider validation. 
From 6e27bfa1b80d3519fd2d90a9e4629984601a9287 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 6 Apr 2026 02:21:24 +0000 Subject: [PATCH 4/4] refactor: hard-remove agentv trace, replace with agentv inspect Delete the trace/ command directory entirely (no deprecated alias). Update all imports from trace/utils to inspect/utils. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/results/serve.ts | 2 +- apps/cli/src/commands/results/shared.ts | 2 +- apps/cli/src/commands/trace/index.ts | 17 - apps/cli/src/commands/trace/list.ts | 93 --- apps/cli/src/commands/trace/score.ts | 401 ------------- apps/cli/src/commands/trace/show.ts | 362 ------------ apps/cli/src/commands/trace/stats.ts | 261 --------- apps/cli/src/commands/trace/utils.ts | 646 --------------------- apps/cli/src/commands/trend/index.ts | 2 +- apps/cli/src/index.ts | 3 - apps/cli/test/commands/trace/trace.test.ts | 6 +- 11 files changed, 6 insertions(+), 1789 deletions(-) delete mode 100644 apps/cli/src/commands/trace/index.ts delete mode 100644 apps/cli/src/commands/trace/list.ts delete mode 100644 apps/cli/src/commands/trace/score.ts delete mode 100644 apps/cli/src/commands/trace/show.ts delete mode 100644 apps/cli/src/commands/trace/stats.ts delete mode 100644 apps/cli/src/commands/trace/utils.ts diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 92f2b20d5..47d503970 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -45,7 +45,7 @@ import { Hono } from 'hono'; import { parseJsonlResults } from '../eval/artifact-writer.js'; import { resolveRunManifestPath } from '../eval/result-layout.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; -import { listResultFiles } from '../trace/utils.js'; +import { listResultFiles } from '../inspect/utils.js'; import { loadLightweightResults, loadManifestResults, diff --git a/apps/cli/src/commands/results/shared.ts 
b/apps/cli/src/commands/results/shared.ts index 874982266..0ba2b38d6 100644 --- a/apps/cli/src/commands/results/shared.ts +++ b/apps/cli/src/commands/results/shared.ts @@ -15,7 +15,7 @@ import { optional, positional, string } from 'cmd-ts'; import type { EvaluationResult } from '@agentv/core'; import { resolveRunManifestPath } from '../eval/result-layout.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; -import { listResultFiles } from '../trace/utils.js'; +import { listResultFiles } from '../inspect/utils.js'; import { loadManifestResults, resolveResultSourcePath } from './manifest.js'; /** cmd-ts positional for optional result source file or workspace directory. */ diff --git a/apps/cli/src/commands/trace/index.ts b/apps/cli/src/commands/trace/index.ts deleted file mode 100644 index debc67672..000000000 --- a/apps/cli/src/commands/trace/index.ts +++ /dev/null @@ -1,17 +0,0 @@ -import { subcommands } from 'cmd-ts'; - -import { traceListCommand } from './list.js'; -import { traceScoreCommand } from './score.js'; -import { traceShowCommand } from './show.js'; -import { traceStatsCommand } from './stats.js'; - -export const traceCommand = subcommands({ - name: 'trace', - description: 'Inspect and analyze evaluation traces and results', - cmds: { - list: traceListCommand, - score: traceScoreCommand, - show: traceShowCommand, - stats: traceStatsCommand, - }, -}); diff --git a/apps/cli/src/commands/trace/list.ts b/apps/cli/src/commands/trace/list.ts deleted file mode 100644 index 42bea2b72..000000000 --- a/apps/cli/src/commands/trace/list.ts +++ /dev/null @@ -1,93 +0,0 @@ -import { command, number, oneOf, option, optional, string } from 'cmd-ts'; -import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; -import { - type ResultFileMeta, - c, - formatScore, - formatSize, - listResultFiles, - padLeft, - padRight, -} from './utils.js'; - -function formatListTable(metas: ResultFileMeta[]): string { - const lines: string[] = []; - - if 
(metas.length === 0) { - lines.push(`${c.yellow}No run workspaces found in .agentv/results/runs/${c.reset}`); - lines.push(`${c.dim}Run an evaluation first: agentv run ${c.reset}`); - return lines.join('\n'); - } - - lines.push(''); - lines.push(`${c.bold}Evaluation Runs${c.reset} ${c.dim}(.agentv/results/runs/)${c.reset}`); - lines.push(''); - - // Column widths - const maxFileLen = Math.max(4, ...metas.map((m) => m.filename.length)); - - // Header - const header = ` ${padRight('File', maxFileLen)} ${padLeft('Tests', 5)} ${padLeft('Pass', 5)} ${padLeft('Score', 6)} ${padLeft('Size', 7)} Timestamp`; - lines.push(`${c.dim}${header}${c.reset}`); - lines.push( - `${c.dim} ${'─'.repeat(maxFileLen)} ${'─'.repeat(5)} ${'─'.repeat(5)} ${'─'.repeat(6)} ${'─'.repeat(7)} ${'─'.repeat(24)}${c.reset}`, - ); - - for (const meta of metas) { - const passColor = meta.passRate >= 1.0 ? c.green : meta.passRate >= 0.5 ? c.yellow : c.red; - const scoreColor = meta.avgScore >= 0.9 ? c.green : meta.avgScore >= 0.5 ? c.yellow : c.red; - - const row = ` ${padRight(meta.filename, maxFileLen)} ${padLeft(String(meta.testCount), 5)} ${padLeft(`${passColor}${formatScore(meta.passRate)}${c.reset}`, 5)} ${padLeft(`${scoreColor}${formatScore(meta.avgScore)}${c.reset}`, 6)} ${padLeft(formatSize(meta.sizeBytes), 7)} ${c.dim}${meta.timestamp}${c.reset}`; - lines.push(row); - } - - lines.push(''); - lines.push( - `${c.dim}${metas.length} run workspace${metas.length !== 1 ? 
's' : ''} found${c.reset}`, - ); - lines.push(''); - - return lines.join('\n'); -} - -export const traceListCommand = command({ - name: 'list', - description: 'List recent evaluation run workspaces from .agentv/results/runs/', - args: { - limit: option({ - type: optional(number), - long: 'limit', - short: 'n', - description: 'Maximum number of results to show (default: all)', - }), - format: option({ - type: optional(oneOf(['table', 'json'])), - long: 'format', - short: 'f', - description: 'Output format: table (default) or json', - }), - dir: option({ - type: optional(string), - long: 'dir', - short: 'd', - description: 'Working directory (default: current directory)', - }), - }, - handler: async ({ limit, format, dir }) => { - const cwd = dir ?? process.cwd(); - const outputFormat = format ?? 'table'; - - try { - const metas = listResultFiles(cwd, limit); - - if (outputFormat === 'json') { - console.log(JSON.stringify(toSnakeCaseDeep(metas), null, 2)); - } else { - console.log(formatListTable(metas)); - } - } catch (error) { - console.error(`Error: ${(error as Error).message}`); - process.exit(1); - } - }, -}); diff --git a/apps/cli/src/commands/trace/score.ts b/apps/cli/src/commands/trace/score.ts deleted file mode 100644 index da986096c..000000000 --- a/apps/cli/src/commands/trace/score.ts +++ /dev/null @@ -1,401 +0,0 @@ -import { - type EvalTest, - type EvaluationContext, - type EvaluationScore, - type Evaluator, - type EvaluatorConfig, - type EvaluatorDispatchContext, - type Message, - type Provider, - type ProviderRequest, - type ProviderResponse, - createBuiltinRegistry, - toCamelCaseDeep, -} from '@agentv/core'; -import { command, oneOf, option, optional, positional, string } from 'cmd-ts'; -import { - type RawResult, - c, - formatScore, - loadResultFile, - padLeft, - padRight, - toTraceSummary, -} from './utils.js'; - -/** - * Evaluator types that work without an LLM provider. 
- */ -const SUPPORTED_TYPES = [ - 'contains', - 'regex', - 'is-json', - 'equals', - 'latency', - 'cost', - 'token-usage', - 'execution-metrics', -] as const; - -/** - * Parse key=value pairs from a string like "max_tool_calls=10,max_tokens=2000" - */ -function parseKeyValues(s: string): Record { - const result: Record = {}; - if (!s) return result; - for (const pair of s.split(',')) { - const eqIdx = pair.indexOf('='); - if (eqIdx === -1) continue; - result[pair.slice(0, eqIdx).trim()] = pair.slice(eqIdx + 1).trim(); - } - return result; -} - -/** - * Parse an inline evaluator spec string into an EvaluatorConfig. - * - * Supported formats: - * contains:value - * regex:pattern - * is-json - * equals:value - * latency: - * cost: - * token-usage:max_total=N,max_input=N,max_output=N - * execution-metrics:max_tool_calls=N,max_tokens=N,max_llm_calls=N,... - */ -export function parseAssertSpec(spec: string): EvaluatorConfig { - const colonIdx = spec.indexOf(':'); - // Normalize snake_case to kebab-case for backward compat - const type = (colonIdx === -1 ? spec : spec.slice(0, colonIdx)).replace(/_/g, '-'); - const params = colonIdx === -1 ? 
'' : spec.slice(colonIdx + 1); - - switch (type) { - case 'contains': - if (!params) throw new Error('contains requires a value: contains:'); - return { name: 'contains', type: 'contains', value: params } as EvaluatorConfig; - - case 'regex': - if (!params) throw new Error('regex requires a pattern: regex:'); - return { name: 'regex', type: 'regex', value: params } as EvaluatorConfig; - - case 'is-json': - return { name: 'is-json', type: 'is-json' } as EvaluatorConfig; - - case 'equals': - if (!params) throw new Error('equals requires a value: equals:'); - return { name: 'equals', type: 'equals', value: params } as EvaluatorConfig; - - case 'latency': { - const threshold = Number(params); - if (!params || Number.isNaN(threshold)) - throw new Error('latency requires a threshold in ms: latency:'); - return { name: 'latency', type: 'latency', threshold } as EvaluatorConfig; - } - - case 'cost': { - const budget = Number(params); - if (!params || Number.isNaN(budget)) - throw new Error('cost requires a budget in USD: cost:'); - return { name: 'cost', type: 'cost', budget } as EvaluatorConfig; - } - - case 'token-usage': { - const kv = parseKeyValues(params); - const config: Record = { name: 'token-usage', type: 'token-usage' }; - if (kv.max_total) config.max_total = Number(kv.max_total); - if (kv.max_input) config.max_input = Number(kv.max_input); - if (kv.max_output) config.max_output = Number(kv.max_output); - return config as EvaluatorConfig; - } - - case 'execution-metrics': { - const kv = parseKeyValues(params); - const config: Record = { - name: 'execution-metrics', - type: 'execution-metrics', - }; - if (kv.max_tool_calls) config.max_tool_calls = Number(kv.max_tool_calls); - if (kv.max_llm_calls) config.max_llm_calls = Number(kv.max_llm_calls); - if (kv.max_tokens) config.max_tokens = Number(kv.max_tokens); - if (kv.max_cost_usd) config.max_cost_usd = Number(kv.max_cost_usd); - if (kv.max_duration_ms) config.max_duration_ms = Number(kv.max_duration_ms); - return 
config as EvaluatorConfig; - } - - default: - throw new Error( - `Unsupported evaluator type: "${type}". Supported: ${SUPPORTED_TYPES.join(', ')}`, - ); - } -} - -/** - * Extract candidate answer from a result record. - */ -function extractCandidate(raw: RawResult): string { - if (raw.output !== undefined) - return typeof raw.output === 'string' ? raw.output : JSON.stringify(raw.output); - return ''; -} - -/** - * Build a minimal EvalTest stub from a result record. - * Only used to satisfy the EvaluationContext interface — deterministic and - * trace-based evaluators don't access these fields. - */ -function buildTestCase(raw: RawResult): EvalTest { - return { - id: raw.test_id ?? 'unknown', - question: '', - input: [], - expected_output: [], - - file_paths: [], - criteria: '', - }; -} - -/** - * A no-op provider stub for evaluators that don't call LLM providers. - */ -const stubProvider: Provider = { - id: 'trace-score-stub', - kind: 'mock', - targetName: 'trace-score-stub', - invoke(_request: ProviderRequest): Promise { - throw new Error('trace score does not support LLM-based evaluators'); - }, -}; - -/** - * A no-op evaluator stub used as the required llmGrader in the dispatch context. 
- */ -const stubLlmGrader: Evaluator = { - kind: 'llm-grader', - evaluate(): EvaluationScore { - throw new Error('trace score does not support LLM-based evaluators'); - }, -}; - -interface ScoreResult { - testId: string; - candidate: string; - originalScore: number; - newScore: number; - verdict: string; - assertions: readonly { text: string; passed: boolean; evidence?: string }[]; -} - -async function runScore( - results: RawResult[], - evaluatorConfig: EvaluatorConfig, - testIdFilter?: string, -): Promise { - const registry = createBuiltinRegistry(); - - const dispatchContext: EvaluatorDispatchContext = { - llmGrader: stubLlmGrader, - registry, - }; - - const evaluator = await registry.create(evaluatorConfig, dispatchContext); - const scored: ScoreResult[] = []; - - for (const raw of results) { - if (testIdFilter && raw.test_id !== testIdFilter) continue; - - const trace = toTraceSummary(raw); - const candidate = extractCandidate(raw); - const output = raw.output as readonly Message[] | undefined; - - const evalContext: EvaluationContext = { - evalCase: buildTestCase(raw), - candidate, - target: { kind: 'custom' as const, name: raw.target ?? 'unknown', config: {} } as never, - provider: stubProvider, - attempt: 1, - promptInputs: { question: '' }, - now: new Date(), - output: Array.isArray(output) ? output : undefined, - trace, - tokenUsage: raw.token_usage - ? (toCamelCaseDeep(raw.token_usage) as EvaluationContext['tokenUsage']) - : undefined, - costUsd: raw.cost_usd, - durationMs: raw.duration_ms, - startTime: raw.start_time, - endTime: raw.end_time, - }; - - const score = await evaluator.evaluate(evalContext); - scored.push({ - testId: raw.test_id ?? 
'unknown', - candidate: candidate.slice(0, 80), - originalScore: raw.score, - newScore: score.score, - verdict: score.verdict, - assertions: score.assertions, - }); - } - - return scored; -} - -function renderTable(scored: ScoreResult[], assertSpec: string): string { - const lines: string[] = []; - - // Header - const cols = [ - { header: 'Test ID', width: 24 }, - { header: 'Orig', width: 6 }, - { header: 'New', width: 6 }, - { header: 'Verdict', width: 8 }, - { header: 'Detail', width: 50 }, - ]; - - const headerLine = cols - .map((col) => padRight(`${c.bold}${col.header}${c.reset}`, col.width)) - .join(' '); - lines.push(headerLine); - lines.push(cols.map((col) => '─'.repeat(col.width)).join('──')); - - for (const r of scored) { - const verdictColor = r.verdict === 'pass' ? c.green : c.red; - const failed = r.assertions.filter((a) => !a.passed); - const passed = r.assertions.filter((a) => a.passed); - const detail = - failed.length > 0 - ? failed[0].text.slice(0, 48) - : passed.length > 0 - ? passed[0].text.slice(0, 48) - : ''; - - const row = [ - padRight(r.testId.slice(0, 24), cols[0].width), - padLeft(formatScore(r.originalScore), cols[1].width), - padLeft(`${verdictColor}${formatScore(r.newScore)}${c.reset}`, cols[2].width), - padRight(`${verdictColor}${r.verdict.toUpperCase()}${c.reset}`, cols[3].width), - detail.slice(0, cols[4].width), - ].join(' '); - lines.push(row); - } - - // Summary - const passCount = scored.filter((r) => r.verdict === 'pass').length; - const total = scored.length; - const meanScore = total > 0 ? 
scored.reduce((sum, r) => sum + r.newScore, 0) / total : 0; - lines.push(''); - lines.push( - `${c.bold}Assert:${c.reset} ${assertSpec} ${c.bold}Results:${c.reset} ${passCount}/${total} passed (${formatScore(passCount / (total || 1))}) ${c.bold}Mean:${c.reset} ${formatScore(meanScore)}`, - ); - - return lines.join('\n'); -} - -export const traceScoreCommand = command({ - name: 'score', - description: 'Run evaluators against existing trace sources post-hoc', - args: { - file: positional({ - type: string, - displayName: 'trace-source', - description: - 'Path to a run workspace, result manifest, simple trace JSONL, or OTLP JSON file', - }), - assert: option({ - type: string, - long: 'assert', - short: 'a', - description: - 'Evaluator spec: contains:, regex:, is-json, equals:, latency:, cost:, token-usage:, execution-metrics:', - }), - testId: option({ - type: optional(string), - long: 'test-id', - description: 'Filter to a specific test ID', - }), - format: option({ - type: optional(oneOf(['json', 'table'])), - long: 'format', - short: 'f', - description: 'Output format (default: table)', - }), - }, - handler: async ({ file, assert: assertSpec, testId, format }) => { - // Parse the evaluator spec - let evaluatorConfig: EvaluatorConfig; - try { - evaluatorConfig = parseAssertSpec(assertSpec); - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error(`${c.red}Error:${c.reset} ${msg}`); - process.exit(1); - } - - // Load results - let results: RawResult[]; - try { - results = loadResultFile(file); - } catch (err) { - const msg = err instanceof Error ? 
err.message : String(err); - console.error(`${c.red}Error:${c.reset} Could not load result file: ${msg}`); - process.exit(1); - } - - if (results.length === 0) { - console.error(`${c.yellow}Warning:${c.reset} No results found in ${file}`); - process.exit(0); - } - - // Check for trace data if evaluator needs it - const traceRequired = ['latency', 'cost', 'token-usage', 'execution-metrics'].includes( - evaluatorConfig.type, - ); - if (traceRequired) { - const hasTrace = results.some( - (r) => - toTraceSummary(r) || - r.cost_usd !== undefined || - r.duration_ms !== undefined || - r.token_usage !== undefined, - ); - if (!hasTrace) { - console.error( - `${c.red}Error:${c.reset} Source lacks trace metrics. Use an OTLP trace export via ${c.bold}--otel-file${c.reset} or a run manifest with summary metrics in ${c.bold}index.jsonl${c.reset}.`, - ); - process.exit(1); - } - } - - // Run scoring - let scored: ScoreResult[]; - try { - scored = await runScore(results, evaluatorConfig, testId); - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error(`${c.red}Error:${c.reset} Scoring failed: ${msg}`); - process.exit(1); - } - - if (scored.length === 0) { - console.error( - `${c.yellow}Warning:${c.reset} No results matched${testId ? 
` test ID "${testId}"` : ''}`, - ); - process.exit(0); - } - - // Output - if (format === 'json') { - console.log(JSON.stringify(scored, null, 2)); - } else { - console.log(renderTable(scored, assertSpec)); - } - - // Exit with non-zero if any failed - const hasFailures = scored.some((r) => r.verdict !== 'pass'); - if (hasFailures) { - process.exit(1); - } - }, -}); diff --git a/apps/cli/src/commands/trace/show.ts b/apps/cli/src/commands/trace/show.ts deleted file mode 100644 index 50e12f7e7..000000000 --- a/apps/cli/src/commands/trace/show.ts +++ /dev/null @@ -1,362 +0,0 @@ -import { command, flag, oneOf, option, optional, positional, string } from 'cmd-ts'; -import { - type RawResult, - c, - formatCost, - formatDuration, - formatNumber, - formatScore, - getTraceSpans, - getTraceSummary, - loadResultFile, -} from './utils.js'; - -/** - * Render flat trace summary line (fallback when full output messages not available). - */ -function renderFlatTrace(result: RawResult): string { - const trace = getTraceSummary(result); - const parts: string[] = []; - - if (trace?.tool_calls && Object.keys(trace.tool_calls).length > 0) { - const toolParts = Object.entries(trace.tool_calls).map(([name, count]) => { - return count > 1 ? `${name} ×${count}` : name; - }); - parts.push(`Tools: ${toolParts.join(', ')}`); - } - - if (result.duration_ms !== undefined) { - parts.push(`Duration: ${formatDuration(result.duration_ms)}`); - } - - if (result.token_usage) { - const total = result.token_usage.input + result.token_usage.output; - parts.push(`Tokens: ${formatNumber(total)}`); - } - - if (result.cost_usd !== undefined) { - parts.push(`Cost: ${formatCost(result.cost_usd)}`); - } - - if (trace?.llm_call_count !== undefined) { - parts.push(`LLM calls: ${trace.llm_call_count}`); - } - - return parts.join(' | '); -} - -/** - * Render per-evaluator scores inline. 
- */ -function renderScores(scores: { name: string; score: number; type: string }[]): string { - return scores - .map((s) => { - const scoreColor = s.score >= 0.9 ? c.green : s.score >= 0.5 ? c.yellow : c.red; - return `${s.name} ${scoreColor}${formatScore(s.score)}${c.reset}`; - }) - .join(' | '); -} - -// Raw output message shape (snake_case from JSONL) -interface RawMessage { - role?: string; - content?: unknown; - tool_calls?: RawToolCall[]; - start_time?: string; - end_time?: string; - duration_ms?: number; - token_usage?: { input: number; output: number; cached?: number }; -} - -interface RawToolCall { - tool: string; - input?: unknown; - output?: unknown; - start_time?: string; - end_time?: string; - duration_ms?: number; -} - -/** - * Render tree view from output messages. - * Shows a hierarchical trace: LLM calls → tool calls. - */ -function renderTree(result: RawResult): string { - const messages = result.output as RawMessage[] | undefined; - const spans = getTraceSpans(result); - - if (!messages || messages.length === 0) { - if (spans.length > 0) { - return renderSpanTree(result, spans); - } - // Fallback to flat summary - if ( - getTraceSummary(result) || - result.duration_ms !== undefined || - result.cost_usd !== undefined - ) { - return renderFlatTrace(result); - } - return `${c.dim}No trace data available${c.reset}`; - } - - const lines: string[] = []; - const testId = result.test_id ?? result.eval_id ?? 'unknown'; - - // Root node: test execution - const totalDuration = result.duration_ms; - const totalTokens = result.token_usage - ? 
result.token_usage.input + result.token_usage.output - : undefined; - const rootParts: string[] = [testId]; - if (totalDuration !== undefined) rootParts.push(formatDuration(totalDuration)); - if (totalTokens !== undefined) rootParts.push(`${formatNumber(totalTokens)} tok`); - if (result.cost_usd !== undefined) rootParts.push(formatCost(result.cost_usd)); - lines.push(`${c.bold}${rootParts.join(', ')}${c.reset}`); - - // Filter to meaningful messages (assistant with tool calls, or assistant responses) - const steps: { type: 'llm' | 'tools'; msg: RawMessage; index: number }[] = []; - for (let i = 0; i < messages.length; i++) { - const msg = messages[i]; - if (msg.role === 'assistant') { - if (msg.tool_calls && msg.tool_calls.length > 0) { - steps.push({ type: 'tools', msg, index: i }); - } else { - steps.push({ type: 'llm', msg, index: i }); - } - } - } - - for (let si = 0; si < steps.length; si++) { - const step = steps[si]; - const isLast = si === steps.length - 1; - const connector = isLast ? '└─' : '├─'; - const childPrefix = isLast ? ' ' : '│ '; - - if (step.type === 'llm') { - // LLM response without tool calls - const parts: string[] = [`${c.cyan}model${c.reset}`]; - if (step.msg.duration_ms !== undefined) parts.push(formatDuration(step.msg.duration_ms)); - if (step.msg.token_usage) { - const tok = step.msg.token_usage.input + step.msg.token_usage.output; - parts.push(`${formatNumber(tok)} tok`); - } - lines.push(`${connector} ${parts.join(', ')}`); - } else { - // Tool calls - const toolCalls = step.msg.tool_calls ?? 
[]; - - if (toolCalls.length === 1) { - // Single tool call — inline - const tc = toolCalls[0]; - const parts: string[] = [`${c.yellow}${tc.tool}${c.reset}`]; - if (tc.duration_ms !== undefined) parts.push(formatDuration(tc.duration_ms)); - lines.push(`${connector} ${parts.join(', ')}`); - } else { - // Multiple tool calls — expand - const parts: string[] = [`${c.dim}tools${c.reset}`]; - if (step.msg.duration_ms !== undefined) parts.push(formatDuration(step.msg.duration_ms)); - lines.push(`${connector} ${parts.join(', ')}`); - - for (let ti = 0; ti < toolCalls.length; ti++) { - const tc = toolCalls[ti]; - const isLastTool = ti === toolCalls.length - 1; - const toolConnector = isLastTool ? '└─' : '├─'; - const tcParts: string[] = [`${c.yellow}${tc.tool}${c.reset}`]; - if (tc.duration_ms !== undefined) tcParts.push(formatDuration(tc.duration_ms)); - lines.push(`${childPrefix}${toolConnector} ${tcParts.join(', ')}`); - } - } - } - } - - // Scores line - if (result.scores && result.scores.length > 0) { - lines.push(''); - lines.push(`${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`); - } - - return lines.join('\n'); -} - -function renderSpanTree(result: RawResult, spans: ReturnType): string { - const lines: string[] = []; - const testId = result.test_id ?? result.eval_id ?? 'unknown'; - const totalTokens = result.token_usage - ? result.token_usage.input + result.token_usage.output - : undefined; - const rootParts: string[] = [testId]; - if (result.duration_ms !== undefined) rootParts.push(formatDuration(result.duration_ms)); - if (totalTokens !== undefined) rootParts.push(`${formatNumber(totalTokens)} tok`); - if (result.cost_usd !== undefined) rootParts.push(formatCost(result.cost_usd)); - lines.push(`${c.bold}${rootParts.join(', ')}${c.reset}`); - - spans.forEach((span, index) => { - const connector = index === spans.length - 1 ? '└─' : '├─'; - const color = span.type === 'llm' ? 
c.cyan : c.yellow; - const parts = [`${color}${span.name}${c.reset}`]; - if (span.duration_ms !== undefined) { - parts.push(formatDuration(span.duration_ms)); - } - lines.push(`${connector} ${parts.join(', ')}`); - }); - - if (result.scores && result.scores.length > 0) { - lines.push(''); - lines.push(`${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`); - } - - return lines.join('\n'); -} - -/** - * Format a single result for table display. - */ -function formatResultDetail(result: RawResult, index: number, tree: boolean): string { - const lines: string[] = []; - const testId = result.test_id ?? result.eval_id ?? `result-${index}`; - - if (tree) { - // Tree view - lines.push(renderTree(result)); - return lines.join('\n'); - } - - // Standard flat view - const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red; - lines.push( - `${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.suite ? 
` ${c.dim}suite: ${result.suite}${c.reset}` : ''}`, - ); - - if (result.error) { - lines.push(` ${c.red}Error: ${result.error}${c.reset}`); - } - - if (result.assertions && result.assertions.length > 0) { - const passed = result.assertions.filter((a: { passed: boolean }) => a.passed); - const failed = result.assertions.filter((a: { passed: boolean }) => !a.passed); - if (passed.length > 0) - lines.push( - ` ${c.green}✓ Passed:${c.reset} ${passed.map((a: { text: string }) => a.text).join(', ')}`, - ); - if (failed.length > 0) - lines.push( - ` ${c.red}✗ Failed:${c.reset} ${failed.map((a: { text: string }) => a.text).join(', ')}`, - ); - } - - if (result.scores && result.scores.length > 0) { - lines.push(` ${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`); - } - - if (result.trace || result.duration_ms !== undefined || result.cost_usd !== undefined) { - lines.push(` ${c.dim}Trace:${c.reset} ${renderFlatTrace(result)}`); - } - - if (result.assertions && result.assertions.length > 0) { - const withEvidence = result.assertions.filter((a: { evidence?: string }) => a.evidence); - if (withEvidence.length > 0) { - const maxLen = 200; - const evidence = (withEvidence[0] as { evidence: string }).evidence; - const truncated = evidence.length > maxLen ? `${evidence.slice(0, maxLen)}...` : evidence; - lines.push(` ${c.dim}Evidence: ${truncated}${c.reset}`); - } - } - - return lines.join('\n'); -} - -function formatShowTable( - results: RawResult[], - filePath: string, - testIdFilter?: string, - tree?: boolean, -): string { - const lines: string[] = []; - - let filtered = results; - if (testIdFilter) { - filtered = results.filter((r) => (r.test_id ?? r.eval_id) === testIdFilter); - if (filtered.length === 0) { - lines.push(`${c.yellow}No results found with test ID "${testIdFilter}"${c.reset}`); - lines.push(''); - lines.push(`${c.dim}Available test IDs:${c.reset}`); - for (const r of results) { - lines.push(` ${r.test_id ?? r.eval_id ?? 
'(unnamed)'}`); - } - return lines.join('\n'); - } - } - - lines.push(''); - lines.push(`${c.bold}Results:${c.reset} ${c.cyan}${filePath}${c.reset}`); - - const totalTests = filtered.length; - const passCount = filtered.filter((r) => r.score >= 1.0).length; - const failCount = totalTests - passCount; - const avgScore = totalTests > 0 ? filtered.reduce((sum, r) => sum + r.score, 0) / totalTests : 0; - - lines.push( - `${c.dim}${totalTests} test${totalTests !== 1 ? 's' : ''} | ${c.green}${passCount} passed${c.reset}${c.dim}${failCount > 0 ? ` | ${c.red}${failCount} failed${c.reset}${c.dim}` : ''} | avg score: ${formatScore(avgScore)}${c.reset}`, - ); - lines.push(''); - - for (let i = 0; i < filtered.length; i++) { - lines.push(formatResultDetail(filtered[i], i, tree ?? false)); - if (i < filtered.length - 1) { - lines.push(`${c.dim}${'─'.repeat(60)}${c.reset}`); - } - } - - lines.push(''); - return lines.join('\n'); -} - -export const traceShowCommand = command({ - name: 'show', - description: 'Show evaluation results with trace details from a result file', - args: { - file: positional({ - type: string, - displayName: 'trace-source', - description: - 'Path to a run workspace, result manifest, simple trace JSONL, or OTLP JSON file', - }), - testId: option({ - type: optional(string), - long: 'test-id', - description: 'Filter to a specific test ID', - }), - tree: flag({ - long: 'tree', - description: 'Show hierarchical trace tree from output messages or exported trace spans', - }), - format: option({ - type: optional(oneOf(['table', 'json'])), - long: 'format', - short: 'f', - description: 'Output format: table (default) or json', - }), - }, - handler: async ({ file, testId, tree, format }) => { - const outputFormat = format ?? 'table'; - - try { - const results = loadResultFile(file); - - let filtered = results; - if (testId) { - filtered = results.filter((r) => (r.test_id ?? 
r.eval_id) === testId); - } - - if (outputFormat === 'json') { - console.log(JSON.stringify(filtered, null, 2)); - } else { - console.log(formatShowTable(results, file, testId, tree)); - } - } catch (error) { - console.error(`Error: ${(error as Error).message}`); - process.exit(1); - } - }, -}); diff --git a/apps/cli/src/commands/trace/stats.ts b/apps/cli/src/commands/trace/stats.ts deleted file mode 100644 index cf3df312c..000000000 --- a/apps/cli/src/commands/trace/stats.ts +++ /dev/null @@ -1,261 +0,0 @@ -import { command, oneOf, option, optional, positional, string } from 'cmd-ts'; -import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; -import { - type RawResult, - c, - formatCost, - formatNumber, - getTraceSummary, - loadResultFile, - padLeft, - padRight, -} from './utils.js'; - -/** - * Compute percentiles from a sorted array of numbers. - */ -export function percentile(sorted: number[], p: number): number { - if (sorted.length === 0) return 0; - const index = (p / 100) * (sorted.length - 1); - const lower = Math.floor(index); - const upper = Math.ceil(index); - if (lower === upper) return sorted[lower]; - return sorted[lower] + (sorted[upper] - sorted[lower]) * (index - lower); -} - -function mean(values: number[]): number { - if (values.length === 0) return 0; - return values.reduce((sum, v) => sum + v, 0) / values.length; -} - -interface MetricRow { - name: string; - values: number[]; - formatter: (n: number) => string; -} - -function collectMetrics(results: RawResult[]): MetricRow[] { - const rows: MetricRow[] = []; - - // Score - const scores = results.map((r) => r.score); - if (scores.length > 0) { - rows.push({ name: 'score', values: scores, formatter: (n) => n.toFixed(2) }); - } - - // Latency - const latencies = results.map((r) => r.duration_ms).filter((v): v is number => v !== undefined); - if (latencies.length > 0) { - rows.push({ - name: 'latency_s', - values: latencies.map((ms) => ms / 1000), - formatter: (n) => n.toFixed(1), - }); - 
} - - // Cost - const costs = results.map((r) => r.cost_usd).filter((v): v is number => v !== undefined); - if (costs.length > 0) { - rows.push({ name: 'cost_usd', values: costs, formatter: (n) => formatCost(n) }); - } - - // Total tokens - const tokens = results - .map((r) => { - if (!r.token_usage) return undefined; - return r.token_usage.input + r.token_usage.output; - }) - .filter((v): v is number => v !== undefined); - if (tokens.length > 0) { - rows.push({ - name: 'tokens_total', - values: tokens, - formatter: (n) => formatNumber(Math.round(n)), - }); - } - - // Tool calls - const toolCalls = results - .map((r) => getTraceSummary(r)?.event_count) - .filter((v): v is number => v !== undefined); - if (toolCalls.length > 0) { - rows.push({ name: 'tool_calls', values: toolCalls, formatter: (n) => String(Math.round(n)) }); - } - - // LLM calls - const llmCalls = results - .map((r) => getTraceSummary(r)?.llm_call_count) - .filter((v): v is number => v !== undefined); - if (llmCalls.length > 0) { - rows.push({ name: 'llm_calls', values: llmCalls, formatter: (n) => String(Math.round(n)) }); - } - - return rows; -} - -interface GroupedResults { - label: string; - results: RawResult[]; -} - -function groupResults(results: RawResult[], groupBy?: string): GroupedResults[] { - if (!groupBy) return [{ label: 'all', results }]; - - const groups = new Map(); - - for (const result of results) { - let key: string; - switch (groupBy) { - case 'target': - key = result.target ?? 'unknown'; - break; - case 'suite': - key = result.suite ?? 'unknown'; - break; - case 'test-id': - key = result.test_id ?? result.eval_id ?? 
'unknown'; - break; - default: - key = 'all'; - } - if (!groups.has(key)) groups.set(key, []); - groups.get(key)?.push(result); - } - - return [...groups.entries()] - .sort(([a], [b]) => a.localeCompare(b)) - .map(([label, results]) => ({ label, results })); -} - -function formatStatsTable(groups: GroupedResults[], filePath: string): string { - const lines: string[] = []; - - lines.push(''); - lines.push(`${c.bold}Statistics:${c.reset} ${c.cyan}${filePath}${c.reset}`); - - for (const group of groups) { - if (groups.length > 1 || group.label !== 'all') { - lines.push(''); - lines.push( - `${c.bold}Group: ${group.label}${c.reset} ${c.dim}(${group.results.length} tests)${c.reset}`, - ); - } else { - lines.push(`${c.dim}${group.results.length} tests${c.reset}`); - } - lines.push(''); - - const metrics = collectMetrics(group.results); - - if (metrics.length === 0) { - lines.push(`${c.yellow}No trace metrics available${c.reset}`); - continue; - } - - // Column headers - const nameWidth = Math.max(12, ...metrics.map((m) => m.name.length)); - const colWidth = 10; - - const header = ` ${padRight('Metric', nameWidth)} ${padLeft('Mean', colWidth)} ${padLeft('P50', colWidth)} ${padLeft('P90', colWidth)} ${padLeft('P95', colWidth)} ${padLeft('P99', colWidth)}`; - lines.push(`${c.dim}${header}${c.reset}`); - lines.push( - `${c.dim} ${'─'.repeat(nameWidth)} ${'─'.repeat(colWidth)} ${'─'.repeat(colWidth)} ${'─'.repeat(colWidth)} ${'─'.repeat(colWidth)} ${'─'.repeat(colWidth)}${c.reset}`, - ); - - for (const metric of metrics) { - const sorted = [...metric.values].sort((a, b) => a - b); - const row = ` ${padRight(metric.name, nameWidth)} ${padLeft(metric.formatter(mean(sorted)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 50)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 90)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 95)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 99)), colWidth)}`; - lines.push(row); - } - } - - 
lines.push(''); - return lines.join('\n'); -} - -interface StatsJson { - file: string; - groups: { - label: string; - count: number; - metrics: Record; - }[]; -} - -function computeStatsJson(groups: GroupedResults[], filePath: string): StatsJson { - return { - file: filePath, - groups: groups.map((group) => { - const metrics = collectMetrics(group.results); - const metricsObj: Record< - string, - { mean: number; p50: number; p90: number; p95: number; p99: number } - > = {}; - - for (const metric of metrics) { - const sorted = [...metric.values].sort((a, b) => a - b); - metricsObj[metric.name] = { - mean: Number(mean(sorted).toFixed(4)), - p50: Number(percentile(sorted, 50).toFixed(4)), - p90: Number(percentile(sorted, 90).toFixed(4)), - p95: Number(percentile(sorted, 95).toFixed(4)), - p99: Number(percentile(sorted, 99).toFixed(4)), - }; - } - - return { - label: group.label, - count: group.results.length, - metrics: metricsObj, - }; - }), - }; -} - -export const traceStatsCommand = command({ - name: 'stats', - description: 'Compute summary statistics (percentiles) across evaluation results', - args: { - file: positional({ - type: string, - displayName: 'trace-source', - description: - 'Path to a run workspace, result manifest, simple trace JSONL, or OTLP JSON file', - }), - groupBy: option({ - type: optional(oneOf(['target', 'eval-set', 'test-id'])), - long: 'group-by', - short: 'g', - description: 'Group statistics by: target, eval-set, or test-id', - }), - format: option({ - type: optional(oneOf(['table', 'json'])), - long: 'format', - short: 'f', - description: 'Output format: table (default) or json', - }), - }, - handler: async ({ file, groupBy, format }) => { - const outputFormat = format ?? 
'table'; - - try { - const results = loadResultFile(file); - - if (results.length === 0) { - console.error('Error: Result file is empty'); - process.exit(1); - } - - const groups = groupResults(results, groupBy); - - if (outputFormat === 'json') { - const statsJson = computeStatsJson(groups, file); - console.log(JSON.stringify(toSnakeCaseDeep(statsJson), null, 2)); - } else { - console.log(formatStatsTable(groups, file)); - } - } catch (error) { - console.error(`Error: ${(error as Error).message}`); - process.exit(1); - } - }, -}); diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts deleted file mode 100644 index f10a97ab4..000000000 --- a/apps/cli/src/commands/trace/utils.ts +++ /dev/null @@ -1,646 +0,0 @@ -import { readFileSync, readdirSync, statSync } from 'node:fs'; -import path from 'node:path'; -import type { EvaluationResult, TraceSummary } from '@agentv/core'; -import { PASS_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; -import { - RESULT_INDEX_FILENAME, - RESULT_RUNS_DIRNAME, - resolveExistingRunPrimaryPath, - resolveWorkspaceOrFilePath, -} from '../eval/result-layout.js'; -import { loadManifestResults } from '../results/manifest.js'; - -// ANSI color codes (no dependency needed) -const colors = { - reset: '\x1b[0m', - bold: '\x1b[1m', - dim: '\x1b[2m', - green: '\x1b[32m', - red: '\x1b[31m', - yellow: '\x1b[33m', - cyan: '\x1b[36m', - gray: '\x1b[90m', -}; - -const noColor = process.env.NO_COLOR !== undefined || !process.stdout.isTTY; -export const c = noColor - ? 
(Object.fromEntries(Object.keys(colors).map((k) => [k, ''])) as typeof colors) - : colors; - -// Regex to strip ANSI escape codes -const ansiPattern = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, 'g'); - -export function stripAnsi(str: string): string { - return str.replace(ansiPattern, ''); -} - -export function padRight(str: string, len: number): string { - const plainLen = stripAnsi(str).length; - return str + ' '.repeat(Math.max(0, len - plainLen)); -} - -export function padLeft(str: string, len: number): string { - const plainLen = stripAnsi(str).length; - return ' '.repeat(Math.max(0, len - plainLen)) + str; -} - -/** - * A raw JSONL result record with snake_case keys as stored on disk. - */ -export interface RawResult { - timestamp?: string; - test_id?: string; - eval_id?: string; - suite?: string; - conversation_id?: string; - score: number; - assertions?: { text: string; passed: boolean; evidence?: string }[]; - target?: string; - error?: string; - scores?: RawEvaluatorScore[]; - trace?: RawTraceSummary; - // Promoted execution metrics (snake_case from JSONL) - token_usage?: { input: number; output: number; cached?: number }; - cost_usd?: number; - duration_ms?: number; - start_time?: string; - end_time?: string; - input?: unknown; - output?: unknown; - spans?: RawTraceSpan[]; - trials?: unknown[]; - aggregation?: unknown; - file_changes?: string; -} - -export interface RawEvaluatorScore { - name: string; - type: string; - score: number; - assertions?: { text: string; passed: boolean; evidence?: string }[]; - weight?: number; -} - -export interface RawTraceSummary { - event_count?: number; - tool_calls?: Record; - error_count?: number; - tool_durations?: Record; - llm_call_count?: number; - // Execution metrics (present when trace includes provider metrics) - token_usage?: { input: number; output: number; cached?: number }; - cost_usd?: number; - duration_ms?: number; -} - -export interface RawTraceSpan { - type?: 'tool' | 'llm' | string; - name: 
string; - duration_ms?: number; -} - -/** - * Load all result or trace records from a supported source. - * - * Supported sources: - * - Run workspace directories / index.jsonl manifests - * - Standalone trace JSONL files for trace-only workflows - * - OTLP JSON trace files written via --otel-file - */ -export function loadResultFile(filePath: string): RawResult[] { - const resolvedFilePath = resolveTraceResultPath(filePath); - - if (path.extname(resolvedFilePath) === '.json') { - return loadOtlpTraceFile(resolvedFilePath); - } - - if (path.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) { - return loadManifestAsRawResults(resolvedFilePath); - } - - return loadJsonlRecords(resolvedFilePath); -} - -function resolveTraceResultPath(filePath: string): string { - return resolveWorkspaceOrFilePath(filePath); -} - -function loadJsonlRecords(filePath: string): RawResult[] { - const content = readFileSync(filePath, 'utf8'); - const lines = content - .trim() - .split('\n') - .filter((line) => line.trim()); - - return lines.map((line, i) => { - const record = JSON.parse(line) as RawResult; - if (typeof record.score !== 'number') { - throw new Error(`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`); - } - return record; - }); -} - -function loadManifestAsRawResults(filePath: string): RawResult[] { - return loadManifestResults(filePath).map(toRawResult); -} - -function toRawResult(result: EvaluationResult): RawResult { - return { - timestamp: result.timestamp, - test_id: result.testId, - suite: result.suite, - conversation_id: result.conversationId, - score: result.score, - assertions: result.assertions?.map((assertion) => ({ - text: assertion.text, - passed: assertion.passed, - evidence: assertion.evidence, - })), - target: result.target, - error: result.error, - scores: result.scores?.map((score) => ({ - name: score.name, - type: score.type, - score: score.score, - assertions: score.assertions?.map((assertion) => ({ - text: assertion.text, - 
passed: assertion.passed, - evidence: assertion.evidence, - })), - weight: score.weight, - })), - token_usage: result.tokenUsage - ? { - input: result.tokenUsage.input, - output: result.tokenUsage.output, - cached: result.tokenUsage.cached, - } - : undefined, - cost_usd: result.costUsd, - duration_ms: result.durationMs, - start_time: result.startTime, - end_time: result.endTime, - input: result.input, - output: result.output, - file_changes: result.fileChanges, - }; -} - -type OtlpAttributeValue = - | { stringValue?: string; intValue?: number | string; doubleValue?: number; boolValue?: boolean } - | { arrayValue?: { values?: OtlpAttributeValue[] } }; - -interface OtlpAttribute { - key: string; - value: OtlpAttributeValue; -} - -interface OtlpEvent { - name?: string; - attributes?: OtlpAttribute[]; -} - -interface OtlpSpan { - traceId?: string; - spanId?: string; - parentSpanId?: string; - name?: string; - startTimeUnixNano?: string; - endTimeUnixNano?: string; - attributes?: OtlpAttribute[]; - status?: { code?: number; message?: string }; - events?: OtlpEvent[]; -} - -function loadOtlpTraceFile(filePath: string): RawResult[] { - const parsed = JSON.parse(readFileSync(filePath, 'utf8')) as { - resourceSpans?: { scopeSpans?: { spans?: OtlpSpan[] }[] }[]; - }; - - const spans = parsed.resourceSpans - ?.flatMap((resource) => resource.scopeSpans ?? []) - .flatMap((scope) => scope.spans ?? []); - - if (!spans || spans.length === 0) { - return []; - } - - const spanMap = new Map(); - const childMap = new Map(); - - for (const span of spans) { - if (!span.spanId) continue; - spanMap.set(span.spanId, span); - if (span.parentSpanId) { - const siblings = childMap.get(span.parentSpanId) ?? []; - siblings.push(span); - childMap.set(span.parentSpanId, siblings); - } - } - - const roots = spans.filter((span) => !span.parentSpanId || !spanMap.has(span.parentSpanId)); - const supportedRoots = roots.filter(isAgentvEvalRoot); - const candidateRoots = supportedRoots.length > 0 ? 
supportedRoots : roots; - - return candidateRoots.map((root, index) => { - const descendants = collectChildSpans(root.spanId, childMap); - const rootAttrs = parseOtlpAttributes(root.attributes); - const parsedDescendants = descendants.map((span) => ({ - ...span, - parsedAttributes: parseOtlpAttributes(span.attributes), - })); - const toolSpans = parsedDescendants.filter( - (span) => typeof span.parsedAttributes.gen_ai_tool_name === 'string', - ); - const llmSpans = parsedDescendants.filter( - (span) => - span.parsedAttributes.gen_ai_operation_name === 'chat' || - (typeof span.name === 'string' && span.name.startsWith('chat ')), - ); - const tokenUsage = descendants.reduce( - (acc, span) => { - const attrs = parseOtlpAttributes(span.attributes); - acc.input += numberAttr(attrs.gen_ai_usage_input_tokens) ?? 0; - acc.output += numberAttr(attrs.gen_ai_usage_output_tokens) ?? 0; - const cached = numberAttr(attrs.gen_ai_usage_cache_read_input_tokens); - if (cached !== undefined && cached > 0) { - acc.cached = (acc.cached ?? 0) + cached; - } - return acc; - }, - { input: 0, output: 0, cached: undefined as number | undefined }, - ); - - const traceSummary = buildDerivedTraceSummary({ - trace: { - event_count: - numberAttr(rootAttrs.agentv_trace_event_count) ?? - (toolSpans.length > 0 ? toolSpans.length : undefined), - tool_calls: countRawSpanNames( - toolSpans.map((span) => ({ - type: 'tool', - name: String(span.parsedAttributes.gen_ai_tool_name), - })), - ), - error_count: descendants.filter((span) => span.status?.code === 2).length || undefined, - llm_call_count: - numberAttr(rootAttrs.agentv_trace_llm_call_count) ?? - (llmSpans.length > 0 ? llmSpans.length : undefined), - }, - spans: [ - ...llmSpans.map((span) => ({ - type: 'llm' as const, - name: span.name ?? 
'chat', - duration_ms: durationFromSpan(span), - })), - ...toolSpans.map((span) => ({ - type: 'tool' as const, - name: String(span.parsedAttributes.gen_ai_tool_name), - duration_ms: durationFromSpan(span), - })), - ], - duration_ms: numberAttr(rootAttrs.agentv_trace_duration_ms) ?? durationFromSpan(root), - cost_usd: numberAttr(rootAttrs.agentv_trace_cost_usd), - token_usage: - tokenUsage.input || - tokenUsage.output || - tokenUsage.cached || - numberAttr(rootAttrs.agentv_trace_token_input) || - numberAttr(rootAttrs.agentv_trace_token_output) || - numberAttr(rootAttrs.agentv_trace_token_cached) - ? { - input: tokenUsage.input || numberAttr(rootAttrs.agentv_trace_token_input) || 0, - output: tokenUsage.output || numberAttr(rootAttrs.agentv_trace_token_output) || 0, - ...(tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) - ? { - cached: - tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) || 0, - } - : {}), - } - : undefined, - }); - - const score = numberAttr(rootAttrs.agentv_score); - if (score === undefined) { - throw new Error( - `Unsupported OTLP trace root span at index ${index + 1}: missing agentv.score attribute`, - ); - } - - return { - test_id: - stringAttr(rootAttrs.agentv_test_id) ?? - stringAttr(rootAttrs.agentv_eval_id) ?? - `trace-${index + 1}`, - suite: stringAttr(rootAttrs.agentv_suite), - target: stringAttr(rootAttrs.agentv_target), - score, - error: root.status?.code === 2 ? root.status.message : undefined, - cost_usd: traceSummary?.cost_usd, - duration_ms: traceSummary?.duration_ms, - token_usage: traceSummary?.token_usage, - trace: traceSummary - ? 
{ - event_count: traceSummary.event_count, - tool_calls: traceSummary.tool_calls, - error_count: traceSummary.error_count, - tool_durations: traceSummary.tool_durations, - llm_call_count: traceSummary.llm_call_count, - token_usage: traceSummary.token_usage, - cost_usd: traceSummary.cost_usd, - duration_ms: traceSummary.duration_ms, - } - : undefined, - spans: traceSummary?.spans, - output: stringAttr(rootAttrs.agentv_output_text), - scores: root.events - ?.filter( - (event) => - event.name?.startsWith('agentv.grader.') || event.name?.startsWith('agentv.evaluator.'), - ) - .map((event) => { - const attrs = parseOtlpAttributes(event.attributes); - const name = - event.name?.replace(/^agentv\.grader\./, '').replace(/^agentv\.evaluator\./, '') ?? - 'unknown'; - return { - name, - type: - stringAttr(attrs.agentv_grader_type) ?? - stringAttr(attrs.agentv_evaluator_type) ?? - 'unknown', - score: - numberAttr(attrs.agentv_grader_score) ?? - numberAttr(attrs.agentv_evaluator_score) ?? - 0, - }; - }), - } satisfies RawResult; - }); -} - -function isAgentvEvalRoot(span: OtlpSpan): boolean { - const attrs = parseOtlpAttributes(span.attributes); - return ( - span.name === 'agentv.eval' || - numberAttr(attrs.agentv_score) !== undefined || - typeof stringAttr(attrs.agentv_test_id) === 'string' - ); -} - -function collectChildSpans( - spanId: string | undefined, - childMap: Map, -): OtlpSpan[] { - if (!spanId) return []; - const direct = childMap.get(spanId) ?? []; - const all = [...direct]; - for (const child of direct) { - all.push(...collectChildSpans(child.spanId, childMap)); - } - return all; -} - -function parseOtlpAttributes(attributes: OtlpAttribute[] | undefined): Record { - const parsed: Record = {}; - for (const attribute of attributes ?? 
[]) { - parsed[attribute.key.replace(/\./g, '_')] = parseOtlpValue(attribute.value); - } - return parsed; -} - -function parseOtlpValue(value: OtlpAttributeValue | undefined): unknown { - if (!value) return undefined; - if ('stringValue' in value && value.stringValue !== undefined) return value.stringValue; - if ('intValue' in value && value.intValue !== undefined) return Number(value.intValue); - if ('doubleValue' in value && value.doubleValue !== undefined) return value.doubleValue; - if ('boolValue' in value && value.boolValue !== undefined) return value.boolValue; - if ('arrayValue' in value) - return (value.arrayValue?.values ?? []).map((entry) => parseOtlpValue(entry)); - return undefined; -} - -function durationFromSpan( - span: Pick, -): number | undefined { - const start = Number(span.startTimeUnixNano); - const end = Number(span.endTimeUnixNano); - if (!Number.isFinite(start) || !Number.isFinite(end)) return undefined; - return Math.round((end - start) / 1_000_000); -} - -function stringAttr(value: unknown): string | undefined { - return typeof value === 'string' ? value : undefined; -} - -function numberAttr(value: unknown): number | undefined { - return typeof value === 'number' && Number.isFinite(value) ? value : undefined; -} - -interface DerivedTraceSummary extends RawTraceSummary { - spans?: RawTraceSpan[]; -} - -export function buildDerivedTraceSummary(result: { - trace?: RawTraceSummary; - spans?: RawTraceSpan[]; - token_usage?: RawResult['token_usage']; - cost_usd?: number; - duration_ms?: number; -}): DerivedTraceSummary | undefined { - const toolSpans = (result.spans ?? []).filter((span) => span.type === 'tool'); - const llmSpans = (result.spans ?? []).filter((span) => span.type === 'llm'); - const toolCalls = result.trace?.tool_calls ?? countRawSpanNames(toolSpans); - const toolDurations = result.trace?.tool_durations ?? groupRawSpanDurations(toolSpans); - const hasSpanData = (result.spans?.length ?? 
0) > 0; - const eventCount = result.trace?.event_count ?? (hasSpanData ? toolSpans.length : undefined); - const llmCallCount = result.trace?.llm_call_count ?? (hasSpanData ? llmSpans.length : undefined); - - if ( - !result.trace && - !result.spans?.length && - result.token_usage === undefined && - result.cost_usd === undefined && - result.duration_ms === undefined - ) { - return undefined; - } - - return { - event_count: eventCount, - tool_calls: toolCalls, - error_count: result.trace?.error_count, - tool_durations: toolDurations, - llm_call_count: llmCallCount, - token_usage: result.trace?.token_usage ?? result.token_usage, - cost_usd: result.trace?.cost_usd ?? result.cost_usd, - duration_ms: result.trace?.duration_ms ?? result.duration_ms, - spans: result.spans, - }; -} - -function countRawSpanNames(spans: RawTraceSpan[]): Record | undefined { - const counts: Record = {}; - for (const span of spans) { - counts[span.name] = (counts[span.name] ?? 0) + 1; - } - return Object.keys(counts).length > 0 ? counts : undefined; -} - -function groupRawSpanDurations(spans: RawTraceSpan[]): Record | undefined { - const grouped: Record = {}; - for (const span of spans) { - if (span.duration_ms === undefined) continue; - const existing = grouped[span.name] ?? []; - existing.push(span.duration_ms); - grouped[span.name] = existing; - } - return Object.keys(grouped).length > 0 ? grouped : undefined; -} - -export function getTraceSummary(result: RawResult): RawTraceSummary | undefined { - const derived = buildDerivedTraceSummary(result); - if (!derived) return undefined; - const { spans: _spans, ...trace } = derived; - return trace; -} - -export function getTraceSpans(result: RawResult): RawTraceSpan[] { - return buildDerivedTraceSummary(result)?.spans ?? 
[]; -} - -export function toTraceSummary(result: RawResult): TraceSummary | undefined { - const rawTrace = getTraceSummary(result); - if (!rawTrace) return undefined; - return toCamelCaseDeep(rawTrace) as TraceSummary; -} - -/** - * Metadata about a discovered run manifest for listing. - */ -export interface ResultFileMeta { - path: string; - filename: string; - timestamp: string; - testCount: number; - passRate: number; - avgScore: number; - sizeBytes: number; -} - -/** - * Enumerate canonical run manifests in `.agentv/results/runs/`. - */ -export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { - const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME); - - const files: { filePath: string; displayName: string }[] = []; - - try { - const entries = readdirSync(runsDir, { withFileTypes: true }); - for (const entry of entries) { - if (!entry.isDirectory()) { - continue; - } - - const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name)); - if (primaryPath) { - files.push({ filePath: primaryPath, displayName: entry.name }); - } - } - } catch { - // runs/ doesn't exist yet - } - - // Sort by display name descending (most recent first) - files.sort((a, b) => b.displayName.localeCompare(a.displayName)); - - const limited = limit !== undefined && limit > 0 ? files.slice(0, limit) : files; - - const metas: ResultFileMeta[] = []; - - for (const { filePath, displayName } of limited) { - try { - const fileStat = statSync(filePath); - const results = loadResultFile(filePath); - - const testCount = results.length; - const passCount = results.filter((r) => r.score >= PASS_THRESHOLD).length; - const passRate = testCount > 0 ? passCount / testCount : 0; - const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0; - - const filenameTimestamp = extractTimestampFromFilename(displayName); - const timestamp = filenameTimestamp ?? results[0]?.timestamp ?? 
'unknown'; - - metas.push({ - path: filePath, - filename: displayName, - timestamp, - testCount, - passRate, - avgScore, - sizeBytes: fileStat.size, - }); - } catch { - // Skip unreadable files - } - } - - return metas; -} - -/** - * Extract ISO timestamp from eval filename like eval_2026-02-20T21-38-05-833Z.jsonl - */ -export function extractTimestampFromFilename(filename: string): string | undefined { - const match = filename.match( - /(?:^|eval_)(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z)(?:\.jsonl)?$/, - ); - if (!match) return undefined; - // Re-convert dashes back to colons/dots for display - return match[1].replace(/-(\d{2})-(\d{2})-(\d{3})Z$/, ':$1:$2.$3Z'); -} - -/** - * Format a number with commas for display. - */ -export function formatNumber(n: number): string { - return n.toLocaleString(); -} - -/** - * Format duration in ms to human-readable. - */ -export function formatDuration(ms: number): string { - if (ms < 1000) return `${Math.round(ms)}ms`; - if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`; - const minutes = Math.floor(ms / 60000); - const seconds = ((ms % 60000) / 1000).toFixed(0); - return `${minutes}m${seconds}s`; -} - -/** - * Format cost in USD. - */ -export function formatCost(usd: number): string { - if (usd < 0.01) return `$${usd.toFixed(4)}`; - return `$${usd.toFixed(3)}`; -} - -/** - * Format file size for display. - */ -export function formatSize(bytes: number): string { - if (bytes < 1024) return `${bytes}B`; - if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}KB`; - return `${(bytes / (1024 * 1024)).toFixed(1)}MB`; -} - -/** - * Format a score as percentage. 
- */ -export function formatScore(score: number): string { - return `${(score * 100).toFixed(0)}%`; -} diff --git a/apps/cli/src/commands/trend/index.ts b/apps/cli/src/commands/trend/index.ts index 7ef5218ff..69078ff44 100644 --- a/apps/cli/src/commands/trend/index.ts +++ b/apps/cli/src/commands/trend/index.ts @@ -4,12 +4,12 @@ import { command, flag, number, oneOf, option, optional, restPositionals, string import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; +import { listResultFiles } from '../inspect/utils.js'; import { type LightweightResultRecord, loadLightweightResults, resolveResultSourcePath, } from '../results/manifest.js'; -import { listResultFiles } from '../trace/utils.js'; const colors = { reset: '\x1b[0m', diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 29e26eb51..78561b033 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -12,7 +12,6 @@ import { pipelineCommand } from './commands/pipeline/index.js'; import { resultsCommand } from './commands/results/index.js'; import { resultsServeCommand } from './commands/results/serve.js'; import { selfCommand } from './commands/self/index.js'; -import { traceCommand } from './commands/trace/index.js'; import { transpileCommand } from './commands/transpile/index.js'; import { trendCommand } from './commands/trend/index.js'; import { trimCommand } from './commands/trim/index.js'; @@ -37,7 +36,6 @@ export const app = subcommands({ serve: resultsServeCommand, studio: resultsServeCommand, inspect: inspectCommand, - trace: traceCommand, // deprecated alias — use `inspect` instead trend: trendCommand, transpile: transpileCommand, trim: trimCommand, @@ -68,7 +66,6 @@ const TOP_LEVEL_COMMANDS = new Set([ 'self', 'serve', 'studio', - 'trace', 'trend', 'transpile', 'trim', diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index 3f157b893..f49941f6b 100644 --- 
a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -3,14 +3,14 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; -import { parseAssertSpec } from '../../../src/commands/trace/score.js'; -import { percentile } from '../../../src/commands/trace/stats.js'; +import { parseAssertSpec } from '../../../src/commands/inspect/score.js'; +import { percentile } from '../../../src/commands/inspect/stats.js'; import { extractTimestampFromFilename, formatDuration, listResultFiles, loadResultFile, -} from '../../../src/commands/trace/utils.js'; +} from '../../../src/commands/inspect/utils.js'; // Test JSONL content with trace data const RESULT_WITH_TRACE = JSON.stringify({