Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,12 @@ export const evalRunCommand = command({
long: 'exclude-tag',
description: 'Skip eval files that have this tag (repeatable, file skipped if any match)',
}),
transcript: option({
type: optional(string),
long: 'transcript',
description:
'Grade a pre-recorded transcript JSONL instead of invoking a live provider. Ignores targets.',
}),
},
handler: async (args) => {
// Launch interactive wizard when no eval paths and stdin is a TTY
Expand Down Expand Up @@ -237,6 +243,7 @@ export const evalRunCommand = command({
threshold: args.threshold,
tag: args.tag,
excludeTag: args.excludeTag,
transcript: args.transcript,
};
const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
if (result?.allExecutionErrors) {
Expand Down
156 changes: 106 additions & 50 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ interface NormalizedOptions {
readonly threshold?: number;
readonly tags: readonly string[];
readonly excludeTags: readonly string[];
readonly transcript?: string;
}

function normalizeBoolean(value: unknown): boolean {
Expand Down Expand Up @@ -357,6 +358,7 @@ function normalizeOptions(
threshold: normalizeOptionalNumber(rawOptions.threshold),
tags: normalizeStringArray(rawOptions.tag),
excludeTags: normalizeStringArray(rawOptions.excludeTag),
transcript: normalizeString(rawOptions.transcript),
} satisfies NormalizedOptions;
}

Expand Down Expand Up @@ -507,63 +509,86 @@ async function prepareFileMetadata(params: {
category,
});
const testIds = suite.tests.map((value) => value.id);

// Determine target names: CLI --target flags override YAML
const cliTargets = options.cliTargets;
const suiteTargets = suite.targets;

// Resolve which target names to use (precedence: CLI > suite YAML targets > default)
let targetNames: readonly string[];
if (cliTargets.length > 0) {
targetNames = cliTargets;
} else if (suiteTargets && suiteTargets.length > 0) {
targetNames = suiteTargets;
} else {
targetNames = [];
}

let selections: { selection: TargetSelection; inlineTargetLabel: string }[];

if (targetNames.length > 1) {
// Matrix mode: multiple targets
const multiSelections = await selectMultipleTargets({
testFilePath,
repoRoot,
cwd,
explicitTargetsPath: options.targetsPath,
dryRun: options.dryRun,
dryRunDelay: options.dryRunDelay,
dryRunDelayMin: options.dryRunDelayMin,
dryRunDelayMax: options.dryRunDelayMax,
env: process.env,
targetNames,
});

selections = multiSelections.map((sel) => ({
selection: sel,
inlineTargetLabel: sel.targetName,
}));
} else {
// Single target mode (legacy path)
const selection = await selectTarget({
testFilePath,
repoRoot,
cwd,
explicitTargetsPath: options.targetsPath,
cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
dryRun: options.dryRun,
dryRunDelay: options.dryRunDelay,
dryRunDelayMin: options.dryRunDelayMin,
dryRunDelayMax: options.dryRunDelayMax,
env: process.env,
});

if (options.transcript) {
// --transcript mode: bypass target resolution entirely.
// Create a synthetic TargetSelection for the transcript provider.
const transcriptSelection: TargetSelection = {
definitions: [],
resolvedTarget: {
kind: 'transcript',
name: 'transcript',
config: {} as Record<string, never>,
},
targetName: 'transcript',
targetSource: 'cli',
targetsFilePath: options.transcript,
};
selections = [
{
selection,
inlineTargetLabel: selection.targetName,
selection: transcriptSelection,
inlineTargetLabel: `transcript (${path.basename(options.transcript)})`,
},
];
} else {
// Determine target names: CLI --target flags override YAML
const cliTargets = options.cliTargets;
const suiteTargets = suite.targets;

// Resolve which target names to use (precedence: CLI > suite YAML targets > default)
let targetNames: readonly string[];
if (cliTargets.length > 0) {
targetNames = cliTargets;
} else if (suiteTargets && suiteTargets.length > 0) {
targetNames = suiteTargets;
} else {
targetNames = [];
}

if (targetNames.length > 1) {
// Matrix mode: multiple targets
const multiSelections = await selectMultipleTargets({
testFilePath,
repoRoot,
cwd,
explicitTargetsPath: options.targetsPath,
dryRun: options.dryRun,
dryRunDelay: options.dryRunDelay,
dryRunDelayMin: options.dryRunDelayMin,
dryRunDelayMax: options.dryRunDelayMax,
env: process.env,
targetNames,
});

selections = multiSelections.map((sel) => ({
selection: sel,
inlineTargetLabel: sel.targetName,
}));
} else {
// Single target mode (legacy path)
const selection = await selectTarget({
testFilePath,
repoRoot,
cwd,
explicitTargetsPath: options.targetsPath,
cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
dryRun: options.dryRun,
dryRunDelay: options.dryRunDelay,
dryRunDelayMin: options.dryRunDelayMin,
dryRunDelayMax: options.dryRunDelayMax,
env: process.env,
});

selections = [
{
selection,
inlineTargetLabel: selection.targetName,
},
];
}
}

return {
Expand Down Expand Up @@ -623,6 +648,9 @@ async function runSingleEvalFile(params: {
readonly totalBudgetUsd?: number;
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly providerFactory?: (
target: import('@agentv/core').ResolvedTarget,
) => import('@agentv/core').Provider;
}): Promise<{ results: EvaluationResult[] }> {
const {
testFilePath,
Expand All @@ -645,6 +673,7 @@ async function runSingleEvalFile(params: {
matrixMode,
totalBudgetUsd,
failOnError,
providerFactory,
} = params;

const targetName = selection.targetName;
Expand Down Expand Up @@ -742,6 +771,7 @@ async function runSingleEvalFile(params: {
graderTarget: options.graderTarget,
model: options.model,
threshold: options.threshold,
providerFactory,
streamCallbacks: streamingObserver?.getStreamCallbacks(),
onResult: async (result: EvaluationResult) => {
(
Expand Down Expand Up @@ -1198,6 +1228,31 @@ export async function runEvalCommand(
// Use only files that survived tag filtering (fileMetadata keys)
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));

// --transcript: create a shared TranscriptProvider and validate line count
let transcriptProviderFactory:
| ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider)
| undefined;
if (options.transcript) {
const { TranscriptProvider } = await import('@agentv/core');
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);

// Validate: transcript lines must match total test cases across all files
const totalTests = [...fileMetadata.values()].reduce(
(sum, meta) => sum + meta.testCases.length,
0,
);
if (transcriptProvider.lineCount !== totalTests) {
throw new Error(
`Transcript has ${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`,
);
}

transcriptProviderFactory = () => transcriptProvider;
console.log(
`Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
);
}

try {
await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
const targetPrep = fileMetadata.get(testFilePath);
Expand Down Expand Up @@ -1242,11 +1297,12 @@ export async function runEvalCommand(
selection,
inlineTargetLabel,
testCases: applicableTestCases,
trialsConfig: targetPrep.trialsConfig,
trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
matrixMode: targetPrep.selections.length > 1,
totalBudgetUsd: targetPrep.totalBudgetUsd,
failOnError: targetPrep.failOnError,
threshold: resolvedThreshold,
providerFactory: transcriptProviderFactory,
});

return result.results;
Expand Down
13 changes: 9 additions & 4 deletions apps/cli/src/commands/import/claude.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import { mkdir, writeFile } from 'node:fs/promises';
import path from 'node:path';
import { discoverClaudeSessions, parseClaudeSession, readTranscriptFile } from '@agentv/core';
import {
discoverClaudeSessions,
parseClaudeSession,
readTranscriptFile,
toTranscriptJsonLine,
} from '@agentv/core';
import { command, flag, option, optional, string } from 'cmd-ts';

export const importClaudeCommand = command({
Expand Down Expand Up @@ -106,9 +111,9 @@ export const importClaudeCommand = command({
// Ensure output directory exists
await mkdir(path.dirname(outputPath), { recursive: true });

// Write transcript as JSONL (one message per line)
const outputLines = transcript.messages.map((msg) => JSON.stringify(msg));
await writeFile(outputPath, `${outputLines.join('\n')}\n`, 'utf8');
// Write transcript as JSONL (one line per test case, snake_case wire format)
const jsonLine = toTranscriptJsonLine(transcript);
await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8');

const msgCount = transcript.messages.length;
const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0);
Expand Down
127 changes: 127 additions & 0 deletions apps/cli/src/commands/import/codex.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import { mkdir, writeFile } from 'node:fs/promises';
import path from 'node:path';
import {
discoverCodexSessions,
parseCodexSession,
readTranscriptFile,
toTranscriptJsonLine,
} from '@agentv/core';
import { command, flag, option, optional, string } from 'cmd-ts';

/**
 * `codex` import subcommand.
 *
 * Two modes are supported:
 * - `--list` prints up to 20 discovered sessions and exits.
 * - `--discover latest` parses the most recent session and writes it as a
 *   single-line JSONL transcript.
 *
 * `--date` and `--sessions-dir` are forwarded to session discovery in both
 * modes. When `--output` is omitted, the transcript is written to
 * `.agentv/transcripts/codex-<first 8 characters of the session ID>.jsonl`.
 */
export const importCodexCommand = command({
  name: 'codex',
  description: 'Import a Codex CLI session transcript for offline grading',
  args: {
    discover: option({
      type: optional(string),
      long: 'discover',
      description: 'Discovery mode: "latest" to import the most recent session',
    }),
    date: option({
      type: optional(string),
      long: 'date',
      description: 'Filter sessions by date (YYYY-MM-DD)',
    }),
    output: option({
      type: optional(string),
      long: 'output',
      short: 'o',
      // The default name is derived from the session ID prefix (see `shortId`
      // in the handler), not from a timestamp.
      description:
        'Output file path (default: .agentv/transcripts/codex-<session-id-prefix>.jsonl)',
    }),
    sessionsDir: option({
      type: optional(string),
      long: 'sessions-dir',
      description: 'Override the default ~/.codex/sessions directory',
    }),
    list: flag({
      long: 'list',
      description: 'List available sessions instead of importing',
    }),
  },
  handler: async ({ discover, date, output, sessionsDir, list }) => {
    // Listing mode: print what was discovered and stop; nothing is written.
    if (list) {
      const sessions = await discoverCodexSessions({
        date,
        sessionsDir,
        limit: 20,
      });

      if (sessions.length === 0) {
        console.log('No Codex CLI sessions found.');
        return;
      }

      console.log(`Found ${sessions.length} session(s):\n`);
      for (const session of sessions) {
        const age = formatAge(session.updatedAt);
        console.log(` ${session.sessionId} ${age} ${session.filename}`);
      }
      return;
    }

    // Import mode currently only supports selecting the latest session.
    if (discover !== 'latest') {
      console.error('Error: specify --discover latest to select a session.');
      process.exit(1);
    }

    const sessions = await discoverCodexSessions({
      date,
      sessionsDir,
      latest: true,
    });

    if (sessions.length === 0) {
      console.error('Error: no Codex CLI sessions found.');
      process.exit(1);
    }

    const session = sessions[0];
    console.log(`Discovered latest session: ${session.filename}`);

    // Parse the session
    const rawJsonl = await readTranscriptFile(session.filePath);
    const transcript = parseCodexSession(rawJsonl);

    // Determine output path: an explicit --output wins, otherwise use the
    // first 8 characters of the session ID to build the default file name.
    const shortId = session.sessionId.slice(0, 8);
    const outputPath = output ?? path.join('.agentv', 'transcripts', `codex-${shortId}.jsonl`);

    // Ensure output directory exists
    await mkdir(path.dirname(outputPath), { recursive: true });

    // Write transcript as JSONL (snake_case wire format): the whole session is
    // serialized as one JSON object on one line.
    const jsonLine = toTranscriptJsonLine(transcript);
    await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8');

    // Summary for the user: message and tool-call totals, plus optional metadata.
    const msgCount = transcript.messages.length;
    const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0);

    console.log(`Imported ${msgCount} messages (${toolCount} tool calls) → ${outputPath}`);

    if (transcript.source.model) {
      console.log(` Model: ${transcript.source.model}`);
    }
    if (transcript.durationMs !== undefined) {
      console.log(` Duration: ${formatDurationMs(transcript.durationMs)}`);
    }
  },
});

/**
 * Format how long ago `date` occurred as a compact relative label.
 *
 * Uses whole minutes below one hour (`"12m ago"`), whole hours below one day
 * (`"3h ago"`), and whole days otherwise (`"2d ago"`).
 *
 * @param date - The moment to compare against the current time.
 * @returns The relative age label; never negative.
 */
function formatAge(date: Date): string {
  // Clamp at zero so a timestamp slightly ahead of the local clock reads
  // "0m ago" instead of a negative age such as "-1m ago".
  const diffMs = Math.max(0, Date.now() - date.getTime());
  const diffMin = Math.floor(diffMs / 60_000);
  if (diffMin < 60) return `${diffMin}m ago`;
  const diffHours = Math.floor(diffMin / 60);
  if (diffHours < 24) return `${diffHours}h ago`;
  const diffDays = Math.floor(diffHours / 24);
  return `${diffDays}d ago`;
}

/**
 * Render a millisecond duration as a short human-readable string.
 *
 * Values under one second are shown as-is in milliseconds (`"250ms"`), values
 * under one minute in whole seconds (`"42s"`), and anything longer as minutes
 * plus leftover seconds (`"2m 5s"`).
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDurationMs(ms: number): string {
  if (ms < 1000) {
    return `${ms}ms`;
  }

  const totalSeconds = Math.floor(ms / 1000);
  const wholeMinutes = Math.floor(totalSeconds / 60);
  const leftoverSeconds = totalSeconds % 60;

  return totalSeconds < 60 ? `${totalSeconds}s` : `${wholeMinutes}m ${leftoverSeconds}s`;
}
Loading
Loading