diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 38efa28ad..d46665abe 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -192,6 +192,12 @@ export const evalRunCommand = command({ long: 'exclude-tag', description: 'Skip eval files that have this tag (repeatable, file skipped if any match)', }), + transcript: option({ + type: optional(string), + long: 'transcript', + description: + 'Grade a pre-recorded transcript JSONL instead of invoking a live provider. Ignores targets.', + }), }, handler: async (args) => { // Launch interactive wizard when no eval paths and stdin is a TTY @@ -237,6 +243,7 @@ export const evalRunCommand = command({ threshold: args.threshold, tag: args.tag, excludeTag: args.excludeTag, + transcript: args.transcript, }; const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions }); if (result?.allExecutionErrors) { diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 61aab86a6..5b316c6b2 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -90,6 +90,7 @@ interface NormalizedOptions { readonly threshold?: number; readonly tags: readonly string[]; readonly excludeTags: readonly string[]; + readonly transcript?: string; } function normalizeBoolean(value: unknown): boolean { @@ -357,6 +358,7 @@ function normalizeOptions( threshold: normalizeOptionalNumber(rawOptions.threshold), tags: normalizeStringArray(rawOptions.tag), excludeTags: normalizeStringArray(rawOptions.excludeTag), + transcript: normalizeString(rawOptions.transcript), } satisfies NormalizedOptions; } @@ -507,63 +509,86 @@ async function prepareFileMetadata(params: { category, }); const testIds = suite.tests.map((value) => value.id); - - // Determine target names: CLI --target flags override YAML - const cliTargets = options.cliTargets; const suiteTargets = suite.targets; 
- // Resolve which target names to use (precedence: CLI > suite YAML targets > default) - let targetNames: readonly string[]; - if (cliTargets.length > 0) { - targetNames = cliTargets; - } else if (suiteTargets && suiteTargets.length > 0) { - targetNames = suiteTargets; - } else { - targetNames = []; - } - let selections: { selection: TargetSelection; inlineTargetLabel: string }[]; - if (targetNames.length > 1) { - // Matrix mode: multiple targets - const multiSelections = await selectMultipleTargets({ - testFilePath, - repoRoot, - cwd, - explicitTargetsPath: options.targetsPath, - dryRun: options.dryRun, - dryRunDelay: options.dryRunDelay, - dryRunDelayMin: options.dryRunDelayMin, - dryRunDelayMax: options.dryRunDelayMax, - env: process.env, - targetNames, - }); - - selections = multiSelections.map((sel) => ({ - selection: sel, - inlineTargetLabel: sel.targetName, - })); - } else { - // Single target mode (legacy path) - const selection = await selectTarget({ - testFilePath, - repoRoot, - cwd, - explicitTargetsPath: options.targetsPath, - cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target, - dryRun: options.dryRun, - dryRunDelay: options.dryRunDelay, - dryRunDelayMin: options.dryRunDelayMin, - dryRunDelayMax: options.dryRunDelayMax, - env: process.env, - }); - + if (options.transcript) { + // --transcript mode: bypass target resolution entirely. + // Create a synthetic TargetSelection for the transcript provider. 
+ const transcriptSelection: TargetSelection = { + definitions: [], + resolvedTarget: { + kind: 'transcript', + name: 'transcript', + config: {} as Record<string, unknown>, + }, + targetName: 'transcript', + targetSource: 'cli', + targetsFilePath: options.transcript, + }; selections = [ { - selection, - inlineTargetLabel: selection.targetName, + selection: transcriptSelection, + inlineTargetLabel: `transcript (${path.basename(options.transcript)})`, }, ]; + } else { + // Determine target names: CLI --target flags override YAML + const cliTargets = options.cliTargets; + const suiteTargets = suite.targets; + + // Resolve which target names to use (precedence: CLI > suite YAML targets > default) + let targetNames: readonly string[]; + if (cliTargets.length > 0) { + targetNames = cliTargets; + } else if (suiteTargets && suiteTargets.length > 0) { + targetNames = suiteTargets; + } else { + targetNames = []; + } + + if (targetNames.length > 1) { + // Matrix mode: multiple targets + const multiSelections = await selectMultipleTargets({ + testFilePath, + repoRoot, + cwd, + explicitTargetsPath: options.targetsPath, + dryRun: options.dryRun, + dryRunDelay: options.dryRunDelay, + dryRunDelayMin: options.dryRunDelayMin, + dryRunDelayMax: options.dryRunDelayMax, + env: process.env, + targetNames, + }); + + selections = multiSelections.map((sel) => ({ + selection: sel, + inlineTargetLabel: sel.targetName, + })); + } else { + // Single target mode (legacy path) + const selection = await selectTarget({ + testFilePath, + repoRoot, + cwd, + explicitTargetsPath: options.targetsPath, + cliTargetName: targetNames.length === 1 ?
targetNames[0] : options.target, + dryRun: options.dryRun, + dryRunDelay: options.dryRunDelay, + dryRunDelayMin: options.dryRunDelayMin, + dryRunDelayMax: options.dryRunDelayMax, + env: process.env, + }); + + selections = [ + { + selection, + inlineTargetLabel: selection.targetName, + }, + ]; + } } return { @@ -623,6 +648,9 @@ async function runSingleEvalFile(params: { readonly totalBudgetUsd?: number; readonly failOnError?: FailOnError; readonly threshold?: number; + readonly providerFactory?: ( + target: import('@agentv/core').ResolvedTarget, + ) => import('@agentv/core').Provider; }): Promise<{ results: EvaluationResult[] }> { const { testFilePath, @@ -645,6 +673,7 @@ async function runSingleEvalFile(params: { matrixMode, totalBudgetUsd, failOnError, + providerFactory, } = params; const targetName = selection.targetName; @@ -742,6 +771,7 @@ async function runSingleEvalFile(params: { graderTarget: options.graderTarget, model: options.model, threshold: options.threshold, + providerFactory, streamCallbacks: streamingObserver?.getStreamCallbacks(), onResult: async (result: EvaluationResult) => { ( @@ -1198,6 +1228,31 @@ export async function runEvalCommand( // Use only files that survived tag filtering (fileMetadata keys) const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f)); + // --transcript: create a shared TranscriptProvider and validate line count + let transcriptProviderFactory: + | ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider) + | undefined; + if (options.transcript) { + const { TranscriptProvider } = await import('@agentv/core'); + const transcriptProvider = await TranscriptProvider.fromFile(options.transcript); + + // Validate: transcript lines must match total test cases across all files + const totalTests = [...fileMetadata.values()].reduce( + (sum, meta) => sum + meta.testCases.length, + 0, + ); + if (transcriptProvider.lineCount !== totalTests) { + throw new Error( + `Transcript has 
${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`, + ); + } + + transcriptProviderFactory = () => transcriptProvider; + console.log( + `Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`, + ); + } + try { await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => { const targetPrep = fileMetadata.get(testFilePath); @@ -1242,11 +1297,12 @@ export async function runEvalCommand( selection, inlineTargetLabel, testCases: applicableTestCases, - trialsConfig: targetPrep.trialsConfig, + trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, matrixMode: targetPrep.selections.length > 1, totalBudgetUsd: targetPrep.totalBudgetUsd, failOnError: targetPrep.failOnError, threshold: resolvedThreshold, + providerFactory: transcriptProviderFactory, }); return result.results; diff --git a/apps/cli/src/commands/import/claude.ts b/apps/cli/src/commands/import/claude.ts index 5664d1afe..4e75c2f1b 100644 --- a/apps/cli/src/commands/import/claude.ts +++ b/apps/cli/src/commands/import/claude.ts @@ -1,6 +1,11 @@ import { mkdir, writeFile } from 'node:fs/promises'; import path from 'node:path'; -import { discoverClaudeSessions, parseClaudeSession, readTranscriptFile } from '@agentv/core'; +import { + discoverClaudeSessions, + parseClaudeSession, + readTranscriptFile, + toTranscriptJsonLine, +} from '@agentv/core'; import { command, flag, option, optional, string } from 'cmd-ts'; export const importClaudeCommand = command({ @@ -106,9 +111,9 @@ export const importClaudeCommand = command({ // Ensure output directory exists await mkdir(path.dirname(outputPath), { recursive: true }); - // Write transcript as JSONL (one message per line) - const outputLines = transcript.messages.map((msg) => JSON.stringify(msg)); - await writeFile(outputPath, `${outputLines.join('\n')}\n`, 'utf8'); + // Write transcript as JSONL (one line per test case, 
snake_case wire format) + const jsonLine = toTranscriptJsonLine(transcript); + await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8'); const msgCount = transcript.messages.length; const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0); diff --git a/apps/cli/src/commands/import/codex.ts b/apps/cli/src/commands/import/codex.ts new file mode 100644 index 000000000..a99035b1d --- /dev/null +++ b/apps/cli/src/commands/import/codex.ts @@ -0,0 +1,127 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { + discoverCodexSessions, + parseCodexSession, + readTranscriptFile, + toTranscriptJsonLine, +} from '@agentv/core'; +import { command, flag, option, optional, string } from 'cmd-ts'; + +export const importCodexCommand = command({ + name: 'codex', + description: 'Import a Codex CLI session transcript for offline grading', + args: { + discover: option({ + type: optional(string), + long: 'discover', + description: 'Discovery mode: "latest" to import the most recent session', + }), + date: option({ + type: optional(string), + long: 'date', + description: 'Filter sessions by date (YYYY-MM-DD)', + }), + output: option({ + type: optional(string), + long: 'output', + short: 'o', + description: 'Output file path (default: .agentv/transcripts/codex-<short-id>.jsonl)', + }), + sessionsDir: option({ + type: optional(string), + long: 'sessions-dir', + description: 'Override the default ~/.codex/sessions directory', + }), + list: flag({ + long: 'list', + description: 'List available sessions instead of importing', + }), + }, + handler: async ({ discover, date, output, sessionsDir, list }) => { + if (list) { + const sessions = await discoverCodexSessions({ + date, + sessionsDir, + limit: 20, + }); + + if (sessions.length === 0) { + console.log('No Codex CLI sessions found.'); + return; + } + + console.log(`Found ${sessions.length} session(s):\n`); + for (const session of sessions) { + const age =
formatAge(session.updatedAt); + console.log(` ${session.sessionId} ${age} ${session.filename}`); + } + return; + } + + if (discover !== 'latest') { + console.error('Error: specify --discover latest to select a session.'); + process.exit(1); + } + + const sessions = await discoverCodexSessions({ + date, + sessionsDir, + latest: true, + }); + + if (sessions.length === 0) { + console.error('Error: no Codex CLI sessions found.'); + process.exit(1); + } + + const session = sessions[0]; + console.log(`Discovered latest session: ${session.filename}`); + + // Parse the session + const rawJsonl = await readTranscriptFile(session.filePath); + const transcript = parseCodexSession(rawJsonl); + + // Determine output path + const shortId = session.sessionId.slice(0, 8); + const outputPath = output ?? path.join('.agentv', 'transcripts', `codex-${shortId}.jsonl`); + + // Ensure output directory exists + await mkdir(path.dirname(outputPath), { recursive: true }); + + // Write transcript as JSONL (snake_case wire format) + const jsonLine = toTranscriptJsonLine(transcript); + await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8'); + + const msgCount = transcript.messages.length; + const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 
0), 0); + + console.log(`Imported ${msgCount} messages (${toolCount} tool calls) → ${outputPath}`); + + if (transcript.source.model) { + console.log(` Model: ${transcript.source.model}`); + } + if (transcript.durationMs !== undefined) { + console.log(` Duration: ${formatDurationMs(transcript.durationMs)}`); + } + }, +}); + +function formatAge(date: Date): string { + const diffMs = Date.now() - date.getTime(); + const diffMin = Math.floor(diffMs / 60_000); + if (diffMin < 60) return `${diffMin}m ago`; + const diffHours = Math.floor(diffMin / 60); + if (diffHours < 24) return `${diffHours}h ago`; + const diffDays = Math.floor(diffHours / 24); + return `${diffDays}d ago`; +} + +function formatDurationMs(ms: number): string { + if (ms < 1000) return `${ms}ms`; + const seconds = Math.floor(ms / 1000); + if (seconds < 60) return `${seconds}s`; + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return `${minutes}m ${remainingSeconds}s`; +} diff --git a/apps/cli/src/commands/import/copilot.ts b/apps/cli/src/commands/import/copilot.ts new file mode 100644 index 000000000..dab154120 --- /dev/null +++ b/apps/cli/src/commands/import/copilot.ts @@ -0,0 +1,158 @@ +import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { discoverCopilotSessions, parseCopilotEvents, toTranscriptJsonLine } from '@agentv/core'; +import { command, flag, option, optional, string } from 'cmd-ts'; + +export const importCopilotCommand = command({ + name: 'copilot', + description: 'Import a Copilot CLI session transcript for offline grading', + args: { + sessionId: option({ + type: optional(string), + long: 'session-id', + description: 'UUID of the Copilot CLI session to import', + }), + discover: option({ + type: optional(string), + long: 'discover', + description: 'Discovery mode: "latest" to import the most recent session', + }), + output: option({ + type: optional(string), + long: 'output', + short: 'o', + description: 
+ 'Output file path (default: .agentv/transcripts/copilot-<short-id>.jsonl)', + }), + sessionStateDir: option({ + type: optional(string), + long: 'session-state-dir', + description: 'Override the default ~/.copilot/session-state directory', + }), + list: flag({ + long: 'list', + description: 'List available sessions instead of importing', + }), + }, + handler: async ({ sessionId, discover, output, sessionStateDir, list }) => { + if (list) { + const sessions = await discoverCopilotSessions({ + sessionStateDir, + limit: 20, + }); + + if (sessions.length === 0) { + console.log('No Copilot CLI sessions found.'); + return; + } + + console.log(`Found ${sessions.length} session(s):\n`); + for (const session of sessions) { + const age = formatAge(session.updatedAt); + const status = session.isActive ? ' (active)' : ''; + console.log(` ${session.sessionId} ${age} ${session.cwd}${status}`); + } + return; + } + + let sessionDir: string; + let resolvedSessionId: string; + + if (sessionId) { + const sessions = await discoverCopilotSessions({ + sessionStateDir, + limit: 100, + }); + const match = sessions.find((s: { sessionId: string }) => s.sessionId === sessionId); + if (!match) { + console.error(`Error: session ${sessionId} not found.`); + process.exit(1); + } + sessionDir = match.sessionDir; + resolvedSessionId = sessionId; + } else if (discover === 'latest') { + const sessions = await discoverCopilotSessions({ + sessionStateDir, + limit: 1, + }); + + if (sessions.length === 0) { + console.error('Error: no Copilot CLI sessions found.'); + process.exit(1); + } + sessionDir = sessions[0].sessionDir; + resolvedSessionId = sessions[0].sessionId; + console.log(`Discovered latest session: ${resolvedSessionId}`); + } else { + console.error('Error: specify --session-id or --discover latest to select a session.'); + process.exit(1); + } + + // Parse the session + const eventsPath = path.join(sessionDir, 'events.jsonl'); + const rawJsonl = await readFile(eventsPath, 'utf8'); + const parsed =
parseCopilotEvents(rawJsonl); + + // Convert to TranscriptEntry format + const transcript = { + messages: parsed.messages, + source: { + provider: 'copilot' as const, + sessionId: resolvedSessionId, + cwd: parsed.meta.cwd, + startedAt: parsed.meta.startedAt, + model: parsed.meta.model, + }, + tokenUsage: parsed.tokenUsage, + durationMs: parsed.durationMs, + costUsd: null as number | null, + }; + + // Determine output path + const shortId = resolvedSessionId.slice(0, 8); + const outputPath = output ?? path.join('.agentv', 'transcripts', `copilot-${shortId}.jsonl`); + + // Ensure output directory exists + await mkdir(path.dirname(outputPath), { recursive: true }); + + // Write transcript as JSONL (snake_case wire format) + const jsonLine = toTranscriptJsonLine(transcript); + await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8'); + + const msgCount = transcript.messages.length; + const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0); + + console.log(`Imported ${msgCount} messages (${toolCount} tool calls) → ${outputPath}`); + + if (transcript.source.model) { + console.log(` Model: ${transcript.source.model}`); + } + if (transcript.durationMs !== undefined) { + console.log(` Duration: ${formatDurationMs(transcript.durationMs)}`); + } + if (transcript.tokenUsage) { + console.log( + ` Tokens: ${transcript.tokenUsage.input} in / ${transcript.tokenUsage.output} out`, + ); + } + }, +}); + +function formatAge(date: Date): string { + const diffMs = Date.now() - date.getTime(); + const diffMin = Math.floor(diffMs / 60_000); + if (diffMin < 60) return `${diffMin}m ago`; + const diffHours = Math.floor(diffMin / 60); + if (diffHours < 24) return `${diffHours}h ago`; + const diffDays = Math.floor(diffHours / 24); + return `${diffDays}d ago`; +} + +function formatDurationMs(ms: number): string { + if (ms < 1000) return `${ms}ms`; + const seconds = Math.floor(ms / 1000); + if (seconds < 60) return `${seconds}s`; + const minutes 
= Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return `${minutes}m ${remainingSeconds}s`; +} diff --git a/apps/cli/src/commands/import/index.ts b/apps/cli/src/commands/import/index.ts index d76ddcaf0..84435d4c0 100644 --- a/apps/cli/src/commands/import/index.ts +++ b/apps/cli/src/commands/import/index.ts @@ -1,11 +1,15 @@ import { subcommands } from 'cmd-ts'; import { importClaudeCommand } from './claude.js'; +import { importCodexCommand } from './codex.js'; +import { importCopilotCommand } from './copilot.js'; export const importCommand = subcommands({ name: 'import', description: 'Import agent session transcripts for offline grading', cmds: { claude: importClaudeCommand, + codex: importCodexCommand, + copilot: importCopilotCommand, }, }); diff --git a/apps/cli/src/commands/trace/index.ts b/apps/cli/src/commands/inspect/index.ts similarity index 74% rename from apps/cli/src/commands/trace/index.ts rename to apps/cli/src/commands/inspect/index.ts index debc67672..94d0a0b0e 100644 --- a/apps/cli/src/commands/trace/index.ts +++ b/apps/cli/src/commands/inspect/index.ts @@ -5,9 +5,9 @@ import { traceScoreCommand } from './score.js'; import { traceShowCommand } from './show.js'; import { traceStatsCommand } from './stats.js'; -export const traceCommand = subcommands({ - name: 'trace', - description: 'Inspect and analyze evaluation traces and results', +export const inspectCommand = subcommands({ + name: 'inspect', + description: 'Inspect and analyze evaluation results', cmds: { list: traceListCommand, score: traceScoreCommand, diff --git a/apps/cli/src/commands/trace/list.ts b/apps/cli/src/commands/inspect/list.ts similarity index 100% rename from apps/cli/src/commands/trace/list.ts rename to apps/cli/src/commands/inspect/list.ts diff --git a/apps/cli/src/commands/trace/score.ts b/apps/cli/src/commands/inspect/score.ts similarity index 100% rename from apps/cli/src/commands/trace/score.ts rename to apps/cli/src/commands/inspect/score.ts diff 
--git a/apps/cli/src/commands/trace/show.ts b/apps/cli/src/commands/inspect/show.ts similarity index 100% rename from apps/cli/src/commands/trace/show.ts rename to apps/cli/src/commands/inspect/show.ts diff --git a/apps/cli/src/commands/trace/stats.ts b/apps/cli/src/commands/inspect/stats.ts similarity index 100% rename from apps/cli/src/commands/trace/stats.ts rename to apps/cli/src/commands/inspect/stats.ts diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/inspect/utils.ts similarity index 100% rename from apps/cli/src/commands/trace/utils.ts rename to apps/cli/src/commands/inspect/utils.ts diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 736174d6e..2ef89726d 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -45,7 +45,7 @@ import { Hono } from 'hono'; import { parseJsonlResults } from '../eval/artifact-writer.js'; import { resolveRunManifestPath } from '../eval/result-layout.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; -import { listResultFiles } from '../trace/utils.js'; +import { listResultFiles } from '../inspect/utils.js'; import { registerEvalRoutes } from './eval-runner.js'; import { loadLightweightResults, diff --git a/apps/cli/src/commands/results/shared.ts b/apps/cli/src/commands/results/shared.ts index 874982266..0ba2b38d6 100644 --- a/apps/cli/src/commands/results/shared.ts +++ b/apps/cli/src/commands/results/shared.ts @@ -15,7 +15,7 @@ import { optional, positional, string } from 'cmd-ts'; import type { EvaluationResult } from '@agentv/core'; import { resolveRunManifestPath } from '../eval/result-layout.js'; import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js'; -import { listResultFiles } from '../trace/utils.js'; +import { listResultFiles } from '../inspect/utils.js'; import { loadManifestResults, resolveResultSourcePath } from './manifest.js'; /** cmd-ts positional for 
optional result source file or workspace directory. */ diff --git a/apps/cli/src/commands/trend/index.ts b/apps/cli/src/commands/trend/index.ts index 7ef5218ff..69078ff44 100644 --- a/apps/cli/src/commands/trend/index.ts +++ b/apps/cli/src/commands/trend/index.ts @@ -4,12 +4,12 @@ import { command, flag, number, oneOf, option, optional, restPositionals, string import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; +import { listResultFiles } from '../inspect/utils.js'; import { type LightweightResultRecord, loadLightweightResults, resolveResultSourcePath, } from '../results/manifest.js'; -import { listResultFiles } from '../trace/utils.js'; const colors = { reset: '\x1b[0m', diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 18cf70feb..78561b033 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -7,11 +7,11 @@ import { createCommand } from './commands/create/index.js'; import { evalCommand } from './commands/eval/index.js'; import { importCommand } from './commands/import/index.js'; import { initCmdTsCommand } from './commands/init/index.js'; +import { inspectCommand } from './commands/inspect/index.js'; import { pipelineCommand } from './commands/pipeline/index.js'; import { resultsCommand } from './commands/results/index.js'; import { resultsServeCommand } from './commands/results/serve.js'; import { selfCommand } from './commands/self/index.js'; -import { traceCommand } from './commands/trace/index.js'; import { transpileCommand } from './commands/transpile/index.js'; import { trendCommand } from './commands/trend/index.js'; import { trimCommand } from './commands/trim/index.js'; @@ -35,7 +35,7 @@ export const app = subcommands({ self: selfCommand, serve: resultsServeCommand, studio: resultsServeCommand, - trace: traceCommand, + inspect: inspectCommand, trend: trendCommand, transpile: transpileCommand, trim: trimCommand, @@ -56,6 +56,7 @@ const 
EVAL_SUBCOMMANDS = new Set(['run', 'assert']); */ const TOP_LEVEL_COMMANDS = new Set([ 'import', + 'inspect', 'compare', 'convert', 'create', @@ -65,7 +66,6 @@ const TOP_LEVEL_COMMANDS = new Set([ 'self', 'serve', 'studio', - 'trace', 'trend', 'transpile', 'trim', diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index 3f157b893..f49941f6b 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -3,14 +3,14 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; -import { parseAssertSpec } from '../../../src/commands/trace/score.js'; -import { percentile } from '../../../src/commands/trace/stats.js'; +import { parseAssertSpec } from '../../../src/commands/inspect/score.js'; +import { percentile } from '../../../src/commands/inspect/stats.js'; import { extractTimestampFromFilename, formatDuration, listResultFiles, loadResultFile, -} from '../../../src/commands/trace/utils.js'; +} from '../../../src/commands/inspect/utils.js'; // Test JSONL content with trace data const RESULT_WITH_TRACE = JSON.stringify({ diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 68191dd11..c58a0c5b8 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -31,7 +31,11 @@ import type { ProviderStreamCallbacks, TargetDefinition, } from './providers/types.js'; -import { extractLastAssistantContent, isAgentProvider } from './providers/types.js'; +import { + LLM_GRADER_CAPABLE_KINDS, + extractLastAssistantContent, + isAgentProvider, +} from './providers/types.js'; import { createBuiltinRegistry, discoverAssertions, discoverGraders } from './registry/index.js'; import { type TokenUsage, @@ -394,6 +398,11 @@ export async function runEvaluation( const graderName = targetContext.graderTarget ?? 
targetContext.name; const resolvedGrader = resolveTargetByName(graderName); if (!resolvedGrader) { + // Only use the eval target as its own grader if it can return structured JSON. + // Agent providers, transcript, cli, and copilot-log cannot grade. + if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) { + return undefined; + } return getOrCreateProvider(targetContext); } return getOrCreateProvider(resolvedGrader); diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index f855e54fd..10e2dd380 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -742,7 +742,8 @@ export type ResolvedTarget = readonly config: VSCodeResolvedConfig; }) | (ResolvedTargetBase & { readonly kind: 'agentv'; readonly config: AgentVResolvedConfig }) - | (ResolvedTargetBase & { readonly kind: 'cli'; readonly config: CliResolvedConfig }); + | (ResolvedTargetBase & { readonly kind: 'cli'; readonly config: CliResolvedConfig }) + | (ResolvedTargetBase & { readonly kind: 'transcript'; readonly config: Record<string, unknown> }); /** * Optional settings accepted on ALL target definitions regardless of provider. diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 970d254dd..b24833643 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -31,7 +31,8 @@ export type ProviderKind = | 'mock' | 'vscode' | 'vscode-insiders' - | 'agentv'; + | 'agentv' + | 'transcript'; /** * Agent providers that spawn interactive sessions with filesystem access. @@ -54,6 +55,24 @@ export const AGENT_PROVIDER_KINDS: readonly ProviderKind[] = [ 'vscode-insiders', ] as const; +/** + * Provider kinds that can return structured JSON for LLM grading. + * Used by the orchestrator to decide whether a target can double as its own + * grader when no explicit grader_target is configured.
+ * + * Providers NOT in this list (agent providers, transcript, cli, copilot-log) + * cannot produce grader responses and should not be used as graders. + */ +export const LLM_GRADER_CAPABLE_KINDS: readonly ProviderKind[] = [ + 'openai', + 'openrouter', + 'azure', + 'anthropic', + 'gemini', + 'agentv', + 'mock', +] as const; + /** * List of all supported provider kinds. * This is the source of truth for provider validation. @@ -78,6 +97,7 @@ export const KNOWN_PROVIDERS: readonly ProviderKind[] = [ 'vscode', 'vscode-insiders', 'agentv', + 'transcript', ] as const; /** diff --git a/packages/core/src/import/codex-parser.ts b/packages/core/src/import/codex-parser.ts new file mode 100644 index 000000000..368452847 --- /dev/null +++ b/packages/core/src/import/codex-parser.ts @@ -0,0 +1,238 @@ +/** + * Codex CLI session JSONL parser. + * + * Reads a Codex CLI rollout transcript + * (~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl) and converts it to AgentV's + * Message[] format. + * + * Each line is a JSON object with one of these top-level types: + * session_meta → session metadata (id, cwd, cli_version, model) + * turn_context → per-turn context (model, cwd, turn_id) + * event_msg → events: task_started, task_complete, user_message, + * agent_message, token_count + * response_item → conversation items: message, function_call, + * function_call_output, reasoning, custom_tool_call, + * custom_tool_call_output + * + * Key behaviors: + * - response_item with type=message and role=user → user Message + * - response_item with type=message and role=assistant → assistant Message + * - response_item with type=function_call → ToolCall (pending output) + * - response_item with type=function_call_output → matched to pending call by call_id + * - response_item with type=reasoning → skipped (thinking tokens) + * - response_item with role=developer → skipped (system prompt) + * - session_meta → source metadata (session_id, cwd, version, model) + * - turn_context → model name 
extraction + * - Duration is from first↔last event timestamp + * - cost_usd is null (Codex CLI does not report per-session cost) + * - Token usage not available from rollout format (rate limit info only) + * + * To add a new response_item type: add a case to the switch in parseCodexSession(). + */ + +import type { Message, ToolCall } from '../evaluation/providers/types.js'; +import type { TranscriptEntry, TranscriptSource } from './types.js'; + +interface CodexLine { + readonly timestamp?: string; + readonly type: string; + readonly payload: Record<string, unknown>; +} + +export function parseCodexSession(jsonl: string): TranscriptEntry { + const messages: Message[] = []; + let sessionId = ''; + let cwd: string | undefined; + let model: string | undefined; + let version: string | undefined; + let startTimestamp: string | undefined; + let endTimestamp: string | undefined; + + // Track pending function calls by call_id + const pendingCalls = new Map<string, { msgIdx: number; toolIdx: number }>(); + + const lines = jsonl.split('\n').filter((l) => l.trim().length > 0); + + for (const line of lines) { + let entry: CodexLine; + try { + entry = JSON.parse(line) as CodexLine; + } catch { + continue; + } + + if (!entry.type) continue; + + // Track timestamps for duration + if (entry.timestamp) { + if (!startTimestamp) startTimestamp = entry.timestamp; + endTimestamp = entry.timestamp; + } + + const payload = entry.payload ?? {}; + + switch (entry.type) { + case 'session_meta': { + sessionId = String(payload.id ?? ''); + cwd = payload.cwd ? String(payload.cwd) : undefined; + version = payload.cli_version ? String(payload.cli_version) : undefined; + if (payload.model && !model) { + model = String(payload.model); + } + break; + } + + case 'turn_context': { + if (payload.model && !model) { + model = String(payload.model); + } + if (payload.cwd && !cwd) { + cwd = String(payload.cwd); + } + break; + } + + case 'response_item': { + const itemType = String(payload.type ?? ''); + const role = String(payload.role ??
''); + + switch (itemType) { + case 'message': { + // Skip developer (system prompt) messages + if (role === 'developer') break; + + const content = extractResponseItemContent(payload.content); + if (role === 'user' && content) { + messages.push({ role: 'user', content }); + } else if (role === 'assistant' && content) { + messages.push({ role: 'assistant', content }); + } + break; + } + + case 'function_call': { + const toolName = String(payload.name ?? ''); + const callId = String(payload.call_id ?? ''); + let input: unknown; + if (typeof payload.arguments === 'string') { + try { + input = JSON.parse(payload.arguments); + } catch { + input = payload.arguments; + } + } else { + input = payload.arguments; + } + + const toolCall: ToolCall = { tool: toolName, input, id: callId }; + const msgIdx = messages.length; + messages.push({ + role: 'assistant', + toolCalls: [toolCall], + }); + + if (callId) { + pendingCalls.set(callId, { msgIdx, toolIdx: 0 }); + } + break; + } + + case 'custom_tool_call': { + const toolName = String(payload.name ?? ''); + const callId = String(payload.call_id ?? ''); + let input: unknown; + if (typeof payload.arguments === 'string') { + try { + input = JSON.parse(payload.arguments); + } catch { + input = payload.arguments; + } + } else { + input = payload.arguments; + } + + const toolCall: ToolCall = { tool: toolName, input, id: callId }; + const msgIdx = messages.length; + messages.push({ + role: 'assistant', + toolCalls: [toolCall], + }); + + if (callId) { + pendingCalls.set(callId, { msgIdx, toolIdx: 0 }); + } + break; + } + + case 'function_call_output': + case 'custom_tool_call_output': { + const callId = String(payload.call_id ?? ''); + const pending = pendingCalls.get(callId); + if (pending) { + const existingMsg = messages[pending.msgIdx]; + const existingCalls = [...(existingMsg.toolCalls ?? 
[])]; + existingCalls[pending.toolIdx] = { + ...existingCalls[pending.toolIdx], + output: payload.output, + }; + messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls }; + pendingCalls.delete(callId); + } + break; + } + + // Skip reasoning blocks (thinking tokens) + case 'reasoning': + break; + } + break; + } + + // Skip event_msg types (task_started, task_complete, token_count, etc.) + // They don't contain conversation content + } + } + + let durationMs: number | undefined; + if (startTimestamp && endTimestamp) { + durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime(); + } + + const source: TranscriptSource = { + provider: 'codex', + sessionId, + cwd, + startedAt: startTimestamp, + model, + version, + }; + + return { + messages, + source, + // Codex rollout files don't include token counts (only rate limit info) + tokenUsage: undefined, + durationMs, + costUsd: null, + }; +} + +/** + * Extract text content from a Codex response_item content array. + * Content is typically: [{ type: "input_text"|"output_text", text: "..." }] + */ +function extractResponseItemContent(content: unknown): string | undefined { + if (typeof content === 'string') return content; + if (!Array.isArray(content)) return undefined; + + const parts: string[] = []; + for (const block of content) { + if (typeof block === 'object' && block !== null) { + const b = block as Record; + if (typeof b.text === 'string') { + parts.push(b.text); + } + } + } + return parts.length > 0 ? parts.join('') : undefined; +} diff --git a/packages/core/src/import/codex-session-discovery.ts b/packages/core/src/import/codex-session-discovery.ts new file mode 100644 index 000000000..08a03a7a4 --- /dev/null +++ b/packages/core/src/import/codex-session-discovery.ts @@ -0,0 +1,113 @@ +/** + * Codex CLI session discovery. + * + * Scans ~/.codex/sessions/ for rollout JSONL files. 
import { readdir, stat } from 'node:fs/promises';
import { homedir } from 'node:os';
import path from 'node:path';

/** A rollout file found under the Codex sessions directory. */
export interface CodexSession {
  /** UUID from the filename */
  readonly sessionId: string;
  /** Full path to the JSONL file */
  readonly filePath: string;
  /** Filename (e.g., rollout-2026-03-29T14-22-01-<uuid>.jsonl) */
  readonly filename: string;
  /** Last modification time */
  readonly updatedAt: Date;
}

export interface CodexDiscoverOptions {
  /** Filter by date string (YYYY-MM-DD). */
  readonly date?: string;
  /** Maximum number of sessions to return (default: 10). */
  readonly limit?: number;
  /** Override the default ~/.codex/sessions directory. */
  readonly sessionsDir?: string;
  /** Return only the most recent session. */
  readonly latest?: boolean;
}

const DEFAULT_SESSIONS_DIR = () => path.join(homedir(), '.codex', 'sessions');

/** List a directory, treating any failure (missing, not a directory, no access) as empty. */
async function listDirSafe(dir: string): Promise<string[]> {
  try {
    return await readdir(dir);
  } catch {
    return [];
  }
}

/**
 * Derive the session id from a rollout filename (extension already removed).
 * With six or more hyphen-separated segments the last five are taken (the shape
 * of a standard UUID); otherwise the whole name is used.
 */
function sessionIdFromName(nameWithoutExt: string): string {
  const parts = nameWithoutExt.split('-');
  return parts.length >= 6 ? parts.slice(-5).join('-') : nameWithoutExt;
}

/** Build a CodexSession for one rollout file; an unreadable file gets the epoch as mtime. */
async function describeRollout(dayPath: string, file: string): Promise<CodexSession> {
  const filePath = path.join(dayPath, file);
  let updatedAt: Date;
  try {
    updatedAt = (await stat(filePath)).mtime;
  } catch {
    updatedAt = new Date(0);
  }
  return {
    sessionId: sessionIdFromName(file.replace(/\.jsonl$/, '')),
    filePath,
    filename: file,
    updatedAt,
  };
}

/**
 * Discover Codex CLI rollout files under `<sessionsDir>/YYYY/MM/DD/`.
 *
 * Only files named `rollout-*.jsonl` are considered. Unreadable directories are
 * skipped; a missing sessions directory yields an empty result.
 *
 * @param opts Optional date filter, result limit, directory override, or `latest`.
 * @returns Sessions sorted by modification time, most recent first, truncated to
 *          the limit (1 when `latest` is set, otherwise `limit` or 10). A negative
 *          limit returns no sessions.
 */
export async function discoverCodexSessions(opts?: CodexDiscoverOptions): Promise<CodexSession[]> {
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  // Clamp so a negative limit cannot turn slice() into "drop from the end".
  const limit = Math.max(0, opts?.latest ? 1 : (opts?.limit ?? 10));
  const date = opts?.date;

  const sessions: CodexSession[] = [];

  for (const year of await listDirSafe(sessionsDir)) {
    // A full match requires these prefixes, so non-matching subtrees can be skipped early.
    if (date && !date.startsWith(`${year}-`)) continue;
    const yearPath = path.join(sessionsDir, year);

    for (const month of await listDirSafe(yearPath)) {
      if (date && !date.startsWith(`${year}-${month}-`)) continue;
      const monthPath = path.join(yearPath, month);

      for (const day of await listDirSafe(monthPath)) {
        if (date && `${year}-${month}-${day}` !== date) continue;
        const dayPath = path.join(monthPath, day);

        const rollouts = (await listDirSafe(dayPath)).filter(
          (file) => file.startsWith('rollout-') && file.endsWith('.jsonl'),
        );
        // stat() calls within one day directory are independent of each other.
        sessions.push(...(await Promise.all(rollouts.map((file) => describeRollout(dayPath, file)))));
      }
    }
  }

  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
import type { Provider, ProviderRequest, ProviderResponse } from '../evaluation/providers/types.js';
import type { TranscriptJsonLine } from './types.js';
import { readTranscriptJsonl } from './types.js';

/**
 * Provider that replays pre-recorded transcript lines instead of invoking a
 * live agent. Used by `agentv eval --transcript` to grade imported sessions.
 *
 * Each call to `invoke()` consumes the next transcript line, in file order, and
 * returns it as a ProviderResponse; the request itself is ignored. The instance
 * is therefore stateful: line N is handed to the N-th invocation.
 * NOTE(review): callers that invoke concurrently get lines in call order —
 * confirm that matches test-case order in the eval runner.
 */
export class TranscriptProvider implements Provider {
  readonly id: string;
  readonly kind = 'transcript' as const;
  readonly targetName: string;

  private readonly lines: readonly TranscriptJsonLine[];
  /** Index of the next line to replay. */
  private cursor = 0;

  /**
   * @param targetName Name reported as the target (also used to build `id`).
   * @param lines Transcript lines to replay, one per invocation.
   */
  constructor(targetName: string, lines: TranscriptJsonLine[]) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }

  /**
   * Create a TranscriptProvider from a JSONL file path. The target name is the
   * source provider recorded on the first line, or 'transcript' when absent.
   *
   * @throws Error when the file contains no transcript lines.
   */
  static async fromFile(filePath: string): Promise<TranscriptProvider> {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // The file is unvalidated JSON, so `source` may be missing despite the type.
    const providerName = lines[0]?.source?.provider ?? 'transcript';
    return new TranscriptProvider(providerName, lines);
  }

  /** Number of transcript lines available for replay. */
  get lineCount(): number {
    return this.lines.length;
  }

  /**
   * Return the next recorded line as a ProviderResponse.
   *
   * @throws Error when more invocations are attempted than lines exist.
   */
  async invoke(_request: ProviderRequest): Promise<ProviderResponse> {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ` +
          `${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`,
      );
    }

    const line = this.lines[this.cursor++];
    const usage = line.token_usage;

    return {
      output: line.output,
      tokenUsage: usage
        ? { input: usage.input, output: usage.output, cached: usage.cached }
        : undefined,
      durationMs: line.duration_ms,
      // The wire format allows null; the response uses undefined for "unknown".
      costUsd: line.cost_usd ?? undefined,
      startTime: line.source?.timestamp,
    };
  }
}
/**
 * One line in a transcript JSONL file (snake_case wire format).
 *
 * Each line is a self-contained test case with pre-populated output.
 * The `input` field is the first user message; the `output` field is the
 * full conversation (Message[]).
 */
export interface TranscriptJsonLine {
  readonly input: string;
  readonly output: readonly Message[];
  readonly token_usage?: {
    readonly input: number;
    readonly output: number;
    readonly cached?: number;
  };
  readonly duration_ms?: number;
  readonly cost_usd?: number | null;
  readonly source: {
    readonly provider: string;
    readonly session_id: string;
    readonly model?: string;
    readonly timestamp?: string;
    readonly git_branch?: string;
    readonly cwd?: string;
    readonly version?: string;
  };
}

/**
 * Convert a parsed TranscriptEntry to the on-disk JSONL wire format.
 *
 * `input` is the content of the first user message when that content is a
 * string, otherwise the empty string. `source.cwd` falls back to the entry's
 * `projectPath` when no `cwd` was recorded.
 */
export function toTranscriptJsonLine(entry: TranscriptEntry): TranscriptJsonLine {
  const { source, tokenUsage } = entry;
  const firstUserMessage = entry.messages.find((m) => m.role === 'user');
  const input = typeof firstUserMessage?.content === 'string' ? firstUserMessage.content : '';

  return {
    input,
    output: entry.messages,
    token_usage: tokenUsage
      ? { input: tokenUsage.input, output: tokenUsage.output, cached: tokenUsage.cached }
      : undefined,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: {
      provider: source.provider,
      session_id: source.sessionId,
      model: source.model,
      timestamp: source.startedAt,
      git_branch: source.gitBranch,
      cwd: source.cwd ?? source.projectPath,
      version: source.version,
    },
  };
}

/**
 * Read a transcript JSONL file and parse each non-blank line into a
 * TranscriptJsonLine.
 *
 * @param filePath Path to the JSONL file.
 * @returns One entry per non-blank line, in file order (empty for an empty file).
 * @throws SyntaxError naming the file and 1-based line number when a line is not
 *         valid JSON or is not a JSON object. File-system errors from reading
 *         the file propagate unchanged.
 */
export async function readTranscriptJsonl(filePath: string): Promise<TranscriptJsonLine[]> {
  // Drop a leading byte-order mark so it cannot break parsing of the first line.
  const text = (await readFile(filePath, 'utf8')).replace(/^\uFEFF/, '');
  const rows: TranscriptJsonLine[] = [];

  for (const [index, rawLine] of text.split('\n').entries()) {
    if (rawLine.trim().length === 0) continue;

    const where = `${filePath}:${index + 1}`;
    let parsed: unknown;
    try {
      parsed = JSON.parse(rawLine);
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`Invalid JSON in transcript ${where}: ${reason}`);
    }
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new SyntaxError(`Transcript line ${where} is not a JSON object`);
    }
    // Only the top-level shape is checked here; field contents are trusted as written.
    rows.push(parsed as TranscriptJsonLine);
  }

  return rows;
}