Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ export interface AggregateGradingArtifact {
export interface IndexArtifactEntry {
readonly timestamp: string;
readonly test_id: string;
readonly dataset?: string;
readonly suite?: string;
readonly category?: string;
readonly conversation_id?: string;
readonly score: number;
Expand Down Expand Up @@ -459,13 +459,13 @@ function safeTestId(testId: string | undefined): string {
return safeArtifactPathSegment(testId, 'unknown');
}

function getDataset(result: EvaluationResult): string | undefined {
return result.dataset;
function getSuite(result: EvaluationResult): string | undefined {
return result.suite;
}

function buildArtifactSubdir(result: EvaluationResult): string {
const segments = [];
const evalSet = getDataset(result);
const evalSet = getSuite(result);
if (evalSet) {
segments.push(safeArtifactPathSegment(evalSet, 'default'));
}
Expand Down Expand Up @@ -504,7 +504,7 @@ export function buildIndexArtifactEntry(
return {
timestamp: result.timestamp,
test_id: result.testId ?? 'unknown',
dataset: getDataset(result),
suite: getSuite(result),
category: result.category,
conversation_id: result.conversationId,
score: result.score,
Expand Down Expand Up @@ -536,7 +536,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA
return {
timestamp: result.timestamp,
test_id: result.testId ?? 'unknown',
dataset: getDataset(result),
suite: getSuite(result),
category: result.category,
conversation_id: result.conversationId,
score: result.score,
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/src/commands/eval/discover.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ export interface DiscoveredEvalFile {
* Discover eval files by glob pattern matching.
*
* Uses `eval_patterns` from `.agentv/config.yaml` if configured,
* otherwise falls back to default patterns that match `dataset*.yaml`
* and `eval.yaml` files under `evals/` directories.
* otherwise falls back to default patterns that match `suite*.yaml`,
* `eval.yaml`, and `dataset*.yaml` files under `evals/` directories.
*/
export async function discoverEvalFiles(cwd: string): Promise<readonly DiscoveredEvalFile[]> {
const repoRoot = await findRepoRoot(cwd);
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/junit-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ export class JunitWriter {

const grouped = new Map<string, EvaluationResult[]>();
for (const result of this.results) {
const suite = result.dataset ?? 'default';
const suite = result.suite ?? 'default';
const existing = grouped.get(suite);
if (existing) {
existing.push(result);
Expand Down
46 changes: 23 additions & 23 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ async function prepareFileMetadata(params: {
readonly testCases: readonly EvalTest[];
readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string }[];
readonly trialsConfig?: TrialsConfig;
readonly datasetTargets?: readonly string[];
readonly suiteTargets?: readonly string[];
readonly yamlWorkers?: number;
readonly yamlCache?: boolean;
readonly yamlCachePath?: string;
Expand All @@ -501,23 +501,23 @@ async function prepareFileMetadata(params: {
const relativePath = path.relative(cwd, testFilePath);
const category = deriveCategory(relativePath);

const dataset = await loadTestSuite(testFilePath, repoRoot, {
const suite = await loadTestSuite(testFilePath, repoRoot, {
verbose: options.verbose,
filter: options.filter,
category,
});
const testIds = dataset.tests.map((value) => value.id);
const testIds = suite.tests.map((value) => value.id);

// Determine target names: CLI --target flags override YAML
const cliTargets = options.cliTargets;
const datasetTargets = dataset.targets;
const suiteTargets = suite.targets;

// Resolve which target names to use (precedence: CLI > dataset YAML targets > default)
// Resolve which target names to use (precedence: CLI > suite YAML targets > default)
let targetNames: readonly string[];
if (cliTargets.length > 0) {
targetNames = cliTargets;
} else if (datasetTargets && datasetTargets.length > 0) {
targetNames = datasetTargets;
} else if (suiteTargets && suiteTargets.length > 0) {
targetNames = suiteTargets;
} else {
targetNames = [];
}
Expand Down Expand Up @@ -568,17 +568,17 @@ async function prepareFileMetadata(params: {

return {
testIds,
testCases: dataset.tests,
testCases: suite.tests,
selections,
trialsConfig: dataset.trials,
datasetTargets,
yamlWorkers: dataset.workers,
yamlCache: dataset.cacheConfig?.enabled,
yamlCachePath: dataset.cacheConfig?.cachePath,
totalBudgetUsd: dataset.totalBudgetUsd,
failOnError: dataset.failOnError,
threshold: dataset.threshold,
tags: dataset.metadata?.tags,
trialsConfig: suite.trials,
suiteTargets,
yamlWorkers: suite.workers,
yamlCache: suite.cacheConfig?.enabled,
yamlCachePath: suite.cacheConfig?.cachePath,
totalBudgetUsd: suite.totalBudgetUsd,
failOnError: suite.failOnError,
threshold: suite.threshold,
tags: suite.metadata?.tags,
};
}

Expand Down Expand Up @@ -1021,7 +1021,7 @@ export async function runEvalCommand(
inlineTargetLabel: string;
}[];
readonly trialsConfig?: TrialsConfig;
readonly datasetTargets?: readonly string[];
readonly suiteTargets?: readonly string[];
readonly yamlWorkers?: number;
readonly yamlCache?: boolean;
readonly yamlCachePath?: string;
Expand Down Expand Up @@ -1104,7 +1104,7 @@ export async function runEvalCommand(
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
}

// Resolve dataset-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
// Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
const yamlThreshold = firstMeta?.threshold;
const resolvedThreshold = options.threshold ?? yamlThreshold;
if (resolvedThreshold !== undefined && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
Expand All @@ -1128,13 +1128,13 @@ export async function runEvalCommand(
// In matrix mode, total eval count is tests × targets (accounting for per-test target overrides)
let totalEvalCount = 0;
for (const meta of fileMetadata.values()) {
const datasetTargetNames = meta.selections.map((s) => s.selection.targetName);
const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
for (const test of meta.testCases) {
// Per-test targets override dataset-level targets.
// Per-test targets override suite-level targets.
const testTargetNames =
test.targets && test.targets.length > 0
? test.targets.filter((t) => datasetTargetNames.includes(t))
: datasetTargetNames;
? test.targets.filter((t) => suiteTargetNames.includes(t))
: suiteTargetNames;
totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
}
}
Expand Down
8 changes: 4 additions & 4 deletions apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,15 @@ export const evalBenchCommand = command({
const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
const testIds: string[] = manifest.test_ids;
const targetName: string = manifest.target?.name ?? 'unknown';
const datasetName: string = manifest.dataset ?? '';
const suiteName: string = manifest.suite ?? '';
const experiment: string | undefined = manifest.experiment;
const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';

const indexLines: string[] = [];
const allPassRates: number[] = [];

for (const testId of testIds) {
const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
const testDir = join(exportDir, ...subpath);
const artifactSubdir = subpath.join('/');
const evaluators: EvaluatorScore[] = [];
Expand Down Expand Up @@ -177,7 +177,7 @@ export const evalBenchCommand = command({
JSON.stringify({
timestamp: manifest.timestamp,
test_id: testId,
dataset: datasetName || undefined,
suite: suiteName || undefined,
experiment: experiment || undefined,
score: Math.round(weightedScore * 1000) / 1000,
target: targetName,
Expand Down
8 changes: 4 additions & 4 deletions apps/cli/src/commands/pipeline/grade.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* Progress is printed to stderr so users see real-time feedback.
*
* Export directory additions:
* <out-dir>/<dataset>/<test-id>/code_grader_results/<name>.json
* <out-dir>/<suite>/<test-id>/code_grader_results/<name>.json
*/
import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
import { join } from 'node:path';
Expand Down Expand Up @@ -196,14 +196,14 @@ export const evalGradeCommand = command({
const manifestPath = join(exportDir, 'manifest.json');
const manifest = JSON.parse(await readFile(manifestPath, 'utf8'));
const testIds: string[] = manifest.test_ids;
const datasetName: string = manifest.dataset ?? '';
const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
const suiteName: string = manifest.suite ?? '';
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';

// Collect all grader tasks upfront so we know the total count
const tasks: GraderTask[] = [];

for (const testId of testIds) {
const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
const testDir = join(exportDir, ...subpath);
const codeGradersDir = join(testDir, 'code_graders');
const resultsDir = join(testDir, 'code_grader_results');
Expand Down
14 changes: 7 additions & 7 deletions apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* Export directory layout:
* <out-dir>/
* ├── manifest.json
* └── <dataset>/ (omitted if eval.yaml has no name)
* └── <suite>/ (omitted if eval.yaml has no name)
* └── <test-id>/
* ├── input.json
* ├── invoke.json
Expand Down Expand Up @@ -58,8 +58,8 @@ export const evalInputCommand = command({
const evalDir = dirname(resolvedEvalPath);

const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
const tests = dataset.tests;
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
const tests = suite.tests;

if (tests.length === 0) {
console.error('No tests found in eval file.');
Expand Down Expand Up @@ -107,13 +107,13 @@ export const evalInputCommand = command({
// No targets file found — subagent-as-target mode
}

const datasetName = dataset.metadata?.name?.trim() ?? '';
const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
const suiteName = suite.metadata?.name?.trim() ?? '';
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';

const testIds: string[] = [];

for (const test of tests) {
const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id];
const subpath = safeSuiteName ? [safeSuiteName, test.id] : [test.id];
const testDir = join(outDir, ...subpath);
await mkdir(testDir, { recursive: true });
testIds.push(test.id);
Expand Down Expand Up @@ -168,7 +168,7 @@ export const evalInputCommand = command({
// manifest.json
await writeJson(join(outDir, 'manifest.json'), {
eval_file: resolvedEvalPath,
dataset: datasetName || undefined,
suite: suiteName || undefined,
experiment: experiment || undefined,
timestamp: new Date().toISOString(),
target: {
Expand Down
16 changes: 8 additions & 8 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ export const evalRunCommand = command({

// ── Step 1: Extract inputs (same as pipeline input) ──────────────
const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
const tests = dataset.tests;
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
const tests = suite.tests;

if (tests.length === 0) {
console.error('No tests found in eval file.');
Expand Down Expand Up @@ -145,13 +145,13 @@ export const evalRunCommand = command({
// No targets file — subagent-as-target mode
}

const datasetName = dataset.metadata?.name?.trim() ?? '';
const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
const suiteName = suite.metadata?.name?.trim() ?? '';
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';

const testIds: string[] = [];

for (const test of tests) {
const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id];
const subpath = safeSuiteName ? [safeSuiteName, test.id] : [test.id];
const testDir = join(outDir, ...subpath);
await mkdir(testDir, { recursive: true });
testIds.push(test.id);
Expand Down Expand Up @@ -198,7 +198,7 @@ export const evalRunCommand = command({

await writeJson(join(outDir, 'manifest.json'), {
eval_file: resolvedEvalPath,
dataset: datasetName || undefined,
suite: suiteName || undefined,
experiment: experiment || undefined,
timestamp: new Date().toISOString(),
target: { name: targetName, kind: targetKind },
Expand Down Expand Up @@ -230,7 +230,7 @@ export const evalRunCommand = command({
writeInvProgress();

const invokeTarget = async (testId: string): Promise<void> => {
const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
const testDir = join(outDir, ...subpath);
const invoke = JSON.parse(await readFile(join(testDir, 'invoke.json'), 'utf8'));
if (invoke.kind !== 'cli') return;
Expand Down Expand Up @@ -341,7 +341,7 @@ export const evalRunCommand = command({
const graderTasks: GraderTask[] = [];

for (const testId of testIds) {
const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
const testDir = join(outDir, ...subpath);
const codeGradersDir = join(testDir, 'code_graders');
const resultsDir = join(testDir, 'code_grader_results');
Expand Down
6 changes: 4 additions & 2 deletions apps/cli/src/commands/results/manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import {
export interface ResultManifestRecord {
readonly timestamp?: string;
readonly test_id?: string;
readonly dataset?: string;
readonly suite?: string;
readonly category?: string;
readonly experiment?: string;
readonly target?: string;
Expand Down Expand Up @@ -123,7 +123,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
return {
timestamp: record.timestamp,
testId,
dataset: record.dataset,
suite: record.suite,
category: record.category,
target: record.target,
score: record.score,
Expand Down Expand Up @@ -189,6 +189,7 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] {

export interface LightweightResultRecord {
readonly testId: string;
readonly suite?: string;
readonly target?: string;
readonly experiment?: string;
readonly score: number;
Expand All @@ -203,6 +204,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec
const content = readFileSync(resolvedSourceFile, 'utf8');
return parseResultManifest(content).map((record) => ({
testId: record.test_id ?? 'unknown',
suite: record.suite,
target: record.target,
experiment: record.experiment,
score: record.score,
Expand Down
Loading
Loading