EntityProcess · christso · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -94,7 +94,7 @@ export interface AggregateGradingArtifact {
 export interface IndexArtifactEntry {
   readonly timestamp: string;
   readonly test_id: string;
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly category?: string;
   readonly conversation_id?: string;
   readonly score: number;
@@ -459,13 +459,13 @@ function safeTestId(testId: string | undefined): string {
   return safeArtifactPathSegment(testId, 'unknown');
 }
 
-function getDataset(result: EvaluationResult): string | undefined {
-  return result.dataset;
+function getSuite(result: EvaluationResult): string | undefined {
+  return result.suite;
 }
 
 function buildArtifactSubdir(result: EvaluationResult): string {
   const segments = [];
-  const evalSet = getDataset(result);
+  const evalSet = getSuite(result);
   if (evalSet) {
     segments.push(safeArtifactPathSegment(evalSet, 'default'));
   }
@@ -504,7 +504,7 @@ export function buildIndexArtifactEntry(
   return {
     timestamp: result.timestamp,
     test_id: result.testId ?? 'unknown',
-    dataset: getDataset(result),
+    suite: getSuite(result),
     category: result.category,
     conversation_id: result.conversationId,
     score: result.score,
@@ -536,7 +536,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA
   return {
     timestamp: result.timestamp,
     test_id: result.testId ?? 'unknown',
-    dataset: getDataset(result),
+    suite: getSuite(result),
     category: result.category,
     conversation_id: result.conversationId,
     score: result.score,

diff --git a/apps/cli/src/commands/eval/discover.ts b/apps/cli/src/commands/eval/discover.ts
@@ -17,8 +17,8 @@ export interface DiscoveredEvalFile {
  * Discover eval files by glob pattern matching.
  *
  * Uses `eval_patterns` from `.agentv/config.yaml` if configured,
- * otherwise falls back to default patterns that match `dataset*.yaml`
- * and `eval.yaml` files under `evals/` directories.
+ * otherwise falls back to default patterns that match `suite*.yaml`,
+ * `eval.yaml`, and `dataset*.yaml` files under `evals/` directories.
  */
 export async function discoverEvalFiles(cwd: string): Promise<readonly DiscoveredEvalFile[]> {
   const repoRoot = await findRepoRoot(cwd);

diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts
@@ -47,7 +47,7 @@ export class JunitWriter {
 
     const grouped = new Map<string, EvaluationResult[]>();
     for (const result of this.results) {
-      const suite = result.dataset ?? 'default';
+      const suite = result.suite ?? 'default';
       const existing = grouped.get(suite);
       if (existing) {
         existing.push(result);

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -480,7 +480,7 @@ async function prepareFileMetadata(params: {
   readonly testCases: readonly EvalTest[];
   readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string }[];
   readonly trialsConfig?: TrialsConfig;
-  readonly datasetTargets?: readonly string[];
+  readonly suiteTargets?: readonly string[];
   readonly yamlWorkers?: number;
   readonly yamlCache?: boolean;
   readonly yamlCachePath?: string;
@@ -501,23 +501,23 @@ async function prepareFileMetadata(params: {
   const relativePath = path.relative(cwd, testFilePath);
   const category = deriveCategory(relativePath);
 
-  const dataset = await loadTestSuite(testFilePath, repoRoot, {
+  const suite = await loadTestSuite(testFilePath, repoRoot, {
     verbose: options.verbose,
     filter: options.filter,
     category,
   });
-  const testIds = dataset.tests.map((value) => value.id);
+  const testIds = suite.tests.map((value) => value.id);
 
   // Determine target names: CLI --target flags override YAML
   const cliTargets = options.cliTargets;
-  const datasetTargets = dataset.targets;
+  const suiteTargets = suite.targets;
 
-  // Resolve which target names to use (precedence: CLI > dataset YAML targets > default)
+  // Resolve which target names to use (precedence: CLI > suite YAML targets > default)
   let targetNames: readonly string[];
   if (cliTargets.length > 0) {
     targetNames = cliTargets;
-  } else if (datasetTargets && datasetTargets.length > 0) {
-    targetNames = datasetTargets;
+  } else if (suiteTargets && suiteTargets.length > 0) {
+    targetNames = suiteTargets;
   } else {
     targetNames = [];
   }
@@ -568,17 +568,17 @@ async function prepareFileMetadata(params: {
 
   return {
     testIds,
-    testCases: dataset.tests,
+    testCases: suite.tests,
     selections,
-    trialsConfig: dataset.trials,
-    datasetTargets,
-    yamlWorkers: dataset.workers,
-    yamlCache: dataset.cacheConfig?.enabled,
-    yamlCachePath: dataset.cacheConfig?.cachePath,
-    totalBudgetUsd: dataset.totalBudgetUsd,
-    failOnError: dataset.failOnError,
-    threshold: dataset.threshold,
-    tags: dataset.metadata?.tags,
+    trialsConfig: suite.trials,
+    suiteTargets,
+    yamlWorkers: suite.workers,
+    yamlCache: suite.cacheConfig?.enabled,
+    yamlCachePath: suite.cacheConfig?.cachePath,
+    totalBudgetUsd: suite.totalBudgetUsd,
+    failOnError: suite.failOnError,
+    threshold: suite.threshold,
+    tags: suite.metadata?.tags,
   };
 }
 
@@ -1021,7 +1021,7 @@ export async function runEvalCommand(
         inlineTargetLabel: string;
       }[];
       readonly trialsConfig?: TrialsConfig;
-      readonly datasetTargets?: readonly string[];
+      readonly suiteTargets?: readonly string[];
       readonly yamlWorkers?: number;
       readonly yamlCache?: boolean;
       readonly yamlCachePath?: string;
@@ -1104,7 +1104,7 @@ export async function runEvalCommand(
     console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
   }
 
-  // Resolve dataset-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
+  // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
   const yamlThreshold = firstMeta?.threshold;
   const resolvedThreshold = options.threshold ?? yamlThreshold;
   if (resolvedThreshold !== undefined && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
@@ -1128,13 +1128,13 @@ export async function runEvalCommand(
   // In matrix mode, total eval count is tests × targets (accounting for per-test target overrides)
   let totalEvalCount = 0;
   for (const meta of fileMetadata.values()) {
-    const datasetTargetNames = meta.selections.map((s) => s.selection.targetName);
+    const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
     for (const test of meta.testCases) {
-      // Per-test targets override dataset-level targets.
+      // Per-test targets override suite-level targets.
       const testTargetNames =
         test.targets && test.targets.length > 0
-          ? test.targets.filter((t) => datasetTargetNames.includes(t))
-          : datasetTargetNames;
+          ? test.targets.filter((t) => suiteTargetNames.includes(t))
+          : suiteTargetNames;
       totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
     }
   }

diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
@@ -37,15 +37,15 @@ export const evalBenchCommand = command({
     const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
     const testIds: string[] = manifest.test_ids;
     const targetName: string = manifest.target?.name ?? 'unknown';
-    const datasetName: string = manifest.dataset ?? '';
+    const suiteName: string = manifest.suite ?? manifest.dataset ?? ''; // old manifests: `dataset`
     const experiment: string | undefined = manifest.experiment;
-    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const indexLines: string[] = [];
     const allPassRates: number[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
+      const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
       const testDir = join(exportDir, ...subpath);
       const artifactSubdir = subpath.join('/');
       const evaluators: EvaluatorScore[] = [];
@@ -177,7 +177,7 @@ export const evalBenchCommand = command({
         JSON.stringify({
           timestamp: manifest.timestamp,
           test_id: testId,
-          dataset: datasetName || undefined,
+          suite: suiteName || undefined,
           experiment: experiment || undefined,
           score: Math.round(weightedScore * 1000) / 1000,
           target: targetName,

diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts
@@ -10,7 +10,7 @@
  * Progress is printed to stderr so users see real-time feedback.
  *
  * Export directory additions:
- *   <out-dir>/<dataset>/<test-id>/code_grader_results/<name>.json
+ *   <out-dir>/<suite>/<test-id>/code_grader_results/<name>.json
  */
 import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
@@ -196,14 +196,14 @@ export const evalGradeCommand = command({
     const manifestPath = join(exportDir, 'manifest.json');
     const manifest = JSON.parse(await readFile(manifestPath, 'utf8'));
     const testIds: string[] = manifest.test_ids;
-    const datasetName: string = manifest.dataset ?? '';
-    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const suiteName: string = manifest.suite ?? manifest.dataset ?? ''; // old manifests: `dataset`
+    const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     // Collect all grader tasks upfront so we know the total count
     const tasks: GraderTask[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
+      const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
       const testDir = join(exportDir, ...subpath);
       const codeGradersDir = join(testDir, 'code_graders');
       const resultsDir = join(testDir, 'code_grader_results');

diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
@@ -9,7 +9,7 @@
  * Export directory layout:
  *   <out-dir>/
  *   ├── manifest.json
- *   └── <dataset>/               (omitted if eval.yaml has no name)
+ *   └── <suite>/                (omitted if eval.yaml has no name)
  *       └── <test-id>/
  *           ├── input.json
  *           ├── invoke.json
@@ -58,8 +58,8 @@ export const evalInputCommand = command({
     const evalDir = dirname(resolvedEvalPath);
 
     const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
-    const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
-    const tests = dataset.tests;
+    const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
+    const tests = suite.tests;
 
     if (tests.length === 0) {
       console.error('No tests found in eval file.');
@@ -107,13 +107,13 @@ export const evalInputCommand = command({
       // No targets file found — subagent-as-target mode
     }
 
-    const datasetName = dataset.metadata?.name?.trim() ?? '';
-    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const suiteName = suite.metadata?.name?.trim() ?? '';
+    const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const testIds: string[] = [];
 
     for (const test of tests) {
-      const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id];
+      const subpath = safeSuiteName ? [safeSuiteName, test.id] : [test.id];
       const testDir = join(outDir, ...subpath);
       await mkdir(testDir, { recursive: true });
       testIds.push(test.id);
@@ -168,7 +168,7 @@ export const evalInputCommand = command({
     // manifest.json
     await writeJson(join(outDir, 'manifest.json'), {
       eval_file: resolvedEvalPath,
-      dataset: datasetName || undefined,
+      suite: suiteName || undefined,
       experiment: experiment || undefined,
       timestamp: new Date().toISOString(),
       target: {

diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
@@ -100,8 +100,8 @@ export const evalRunCommand = command({
 
     // ── Step 1: Extract inputs (same as pipeline input) ──────────────
     const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
-    const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
-    const tests = dataset.tests;
+    const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
+    const tests = suite.tests;
 
     if (tests.length === 0) {
       console.error('No tests found in eval file.');
@@ -145,13 +145,13 @@ export const evalRunCommand = command({
       // No targets file — subagent-as-target mode
     }
 
-    const datasetName = dataset.metadata?.name?.trim() ?? '';
-    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const suiteName = suite.metadata?.name?.trim() ?? '';
+    const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const testIds: string[] = [];
 
     for (const test of tests) {
-      const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id];
+      const subpath = safeSuiteName ? [safeSuiteName, test.id] : [test.id];
       const testDir = join(outDir, ...subpath);
       await mkdir(testDir, { recursive: true });
       testIds.push(test.id);
@@ -198,7 +198,7 @@ export const evalRunCommand = command({
 
     await writeJson(join(outDir, 'manifest.json'), {
       eval_file: resolvedEvalPath,
-      dataset: datasetName || undefined,
+      suite: suiteName || undefined,
       experiment: experiment || undefined,
       timestamp: new Date().toISOString(),
       target: { name: targetName, kind: targetKind },
@@ -230,7 +230,7 @@ export const evalRunCommand = command({
       writeInvProgress();
 
       const invokeTarget = async (testId: string): Promise<void> => {
-        const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
+        const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
         const testDir = join(outDir, ...subpath);
         const invoke = JSON.parse(await readFile(join(testDir, 'invoke.json'), 'utf8'));
         if (invoke.kind !== 'cli') return;
@@ -341,7 +341,7 @@ export const evalRunCommand = command({
     const graderTasks: GraderTask[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
+      const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
       const testDir = join(outDir, ...subpath);
       const codeGradersDir = join(testDir, 'code_graders');
       const resultsDir = join(testDir, 'code_grader_results');

diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
@@ -13,7 +13,7 @@ import {
 export interface ResultManifestRecord {
   readonly timestamp?: string;
   readonly test_id?: string;
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly category?: string;
   readonly experiment?: string;
   readonly target?: string;
@@ -123,7 +123,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
   return {
     timestamp: record.timestamp,
     testId,
-    dataset: record.dataset,
+    suite: record.suite,
     category: record.category,
     target: record.target,
     score: record.score,
@@ -189,6 +189,7 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] {
 
 export interface LightweightResultRecord {
   readonly testId: string;
+  readonly suite?: string;
   readonly target?: string;
   readonly experiment?: string;
   readonly score: number;
@@ -203,6 +204,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec
   const content = readFileSync(resolvedSourceFile, 'utf8');
   return parseResultManifest(content).map((record) => ({
     testId: record.test_id ?? 'unknown',
+    suite: record.suite,
     target: record.target,
     experiment: record.experiment,
     score: record.score,