From 5684e68cd7d137f3c4aff71688f35664ca9ebe15 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sun, 5 Apr 2026 09:33:13 +0000 Subject: [PATCH 1/4] refactor: rename dataset to suite across codebase (#943) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An eval file is a test suite (lifecycle hooks, workspace setup/teardown, execution config), not a dataset (passive input/output pairs). This renames the `dataset` field to `suite` everywhere: - Core types: EvalTest.suite, EvaluationResult.suite - Wire format: JSONL results write `suite` field - CLI: --group-by suite, --suite flag, trace/trend/pipeline commands - Studio UI: routes /suite/, labels "Suites", API endpoints /suites - OTel: agentv.suite attribute - Example baseline JSONL files updated - Documentation and plugin skill files updated Hard deprecation — no backward-compat aliases for `dataset`. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/artifact-writer.ts | 12 +-- apps/cli/src/commands/eval/discover.ts | 4 +- apps/cli/src/commands/eval/junit-writer.ts | 2 +- apps/cli/src/commands/eval/run-eval.ts | 46 ++++----- apps/cli/src/commands/pipeline/bench.ts | 8 +- apps/cli/src/commands/pipeline/grade.ts | 8 +- apps/cli/src/commands/pipeline/input.ts | 14 +-- apps/cli/src/commands/pipeline/run.ts | 16 +-- apps/cli/src/commands/results/manifest.ts | 6 +- apps/cli/src/commands/results/serve.ts | 56 +++++------ apps/cli/src/commands/trace/show.ts | 2 +- apps/cli/src/commands/trace/stats.ts | 4 +- apps/cli/src/commands/trace/utils.ts | 6 +- apps/cli/src/commands/trend/index.ts | 34 +++---- .../commands/eval/artifact-writer.test.ts | 8 +- .../test/commands/eval/output-writers.test.ts | 10 +- .../results/export-e2e-providers.test.ts | 22 ++--- apps/cli/test/commands/results/export.test.ts | 12 +-- apps/cli/test/commands/results/serve.test.ts | 4 +- apps/cli/test/commands/trace/trace.test.ts | 6 +- apps/cli/test/commands/trend/trend.test.ts | 98 +++++++++---------- apps/studio/src/components/Breadcrumbs.tsx | 4 +- apps/studio/src/components/RunDetail.tsx | 50 +++++----- apps/studio/src/components/ScoreBar.tsx | 2 +- apps/studio/src/components/Sidebar.tsx | 36 ++++--- apps/studio/src/lib/api.ts | 40 ++++---- apps/studio/src/lib/types.ts | 10 +- apps/studio/src/routeTree.gen.ts | 34 +++---- .../runs/$runId_.category.$category.tsx | 24 ++--- ....$dataset.tsx => $runId_.suite.$suite.tsx} | 20 ++-- .../docs/docs/evaluation/eval-files.mdx | 4 +- .../web/src/content/docs/docs/tools/trace.mdx | 4 +- .../web/src/content/docs/docs/tools/trend.mdx | 16 +-- .../assert/evals/dataset.eval.baseline.jsonl | 8 +- .../evals/dataset.eval.baseline.jsonl | 14 +-- .../basic/evals/dataset.eval.baseline.jsonl | 14 +-- .../evals/dataset.eval.baseline.jsonl | 8 +- .../evals/dataset.eval.baseline.jsonl | 2 +- .../contextual-precision.eval.baseline.jsonl | 6 +- .../contextual-recall.eval.baseline.jsonl | 6 +- .../evals/dataset.eval.baseline.jsonl | 8 +- .../evals/dataset.eval.baseline.jsonl | 6 +- .../evals/dataset.eval.baseline.jsonl | 14 +-- .../confusion-metrics.eval.baseline.jsonl | 10 +- .../evals/field-accuracy.eval.baseline.jsonl | 12 +-- .../evals/dataset.eval.baseline.jsonl | 12 +-- .../evals/dataset.eval.baseline.jsonl | 10 +- .../evals/dataset.eval.baseline.jsonl | 4 +- .../evals/dataset.eval.baseline.jsonl | 2 +- .../evals/dataset.eval.baseline.jsonl | 10 +- .../evals/dataset.eval.baseline.jsonl | 2 +- .../evals/dataset.eval.baseline.jsonl | 10 +- .../evals/dataset.eval.baseline.jsonl | 4 +- .../evals/dataset.eval.baseline.jsonl | 10 +- .../evals/dataset.eval.baseline.jsonl | 4 +- .../rubric/evals/dataset.eval.baseline.jsonl | 10 +- .../evals/dataset.eval.baseline.jsonl | 4 +- .../evals/dataset.eval.baseline.jsonl | 6 +- .../evals/dataset.eval.baseline.jsonl | 6 +- .../evals/dataset.eval.baseline.jsonl | 2 +- .../evals/dataset.eval.baseline.jsonl | 6 +- .../evals/trace-file-demo.eval.baseline.jsonl | 12 +-- .../evals/dataset.eval.baseline.jsonl | 14 +-- .../evals/multi-agent.eval.results.jsonl | 10 +- .../evals/dataset.eval.baseline.jsonl | 10 +- examples/features/trend/README.md | 14 +-- .../2026-03-01T10-00-00-000Z/index.jsonl | 6 +- .../2026-03-08T10-00-00-000Z/index.jsonl | 6 +- .../2026-03-15T10-00-00-000Z/index.jsonl | 6 +- .../trials/evals/dataset.eval.baseline.jsonl | 4 +- .../evals/dataset.eval.baseline.jsonl | 6 +- .../evals/dataset.eval.baseline.jsonl | 16 +-- .../evals/dataset.eval.baseline.jsonl | 44 ++++----- .../fixtures/setup-a.raw.jsonl | 10 +- .../fixtures/setup-b.raw.jsonl | 10 +- .../evals/encouragement.eval.baseline.jsonl | 16 +-- .../evals/listening.eval.baseline.jsonl | 12 +-- .../evals/routing.eval.baseline.jsonl | 8 +- .../tool-eval-demo.baseline.jsonl | 8 +- .../src/evaluation/loaders/jsonl-parser.ts | 16 +-- packages/core/src/evaluation/orchestrator.ts | 8 +- packages/core/src/evaluation/types.ts | 4 +- .../evaluation/validation/eval-file.schema.ts | 2 +- .../evaluation/validation/eval-validator.ts | 2 +- packages/core/src/evaluation/yaml-parser.ts | 12 +-- .../core/src/observability/otel-exporter.ts | 4 +- .../core/test/evaluation/baseline.test.ts | 4 +- .../code-evaluator-file-backed.test.ts | 2 +- .../code-evaluator-multimodal.test.ts | 2 +- .../core/test/evaluation/evaluators.test.ts | 2 +- .../evaluators/composite-threshold.test.ts | 2 +- .../evaluators/execution-metrics.test.ts | 2 +- .../evaluation/evaluators_variables.test.ts | 2 +- .../test/evaluation/execution-metrics.test.ts | 2 +- .../test/evaluation/execution-status.test.ts | 2 +- .../evaluation/llm-grader-multimodal.test.ts | 2 +- .../evaluation/loaders/jsonl-parser.test.ts | 10 +- .../core/test/evaluation/orchestrator.test.ts | 14 +-- .../observability/streaming-observer.test.ts | 6 +- .../skills/agentv-bench/agents/analyzer.md | 2 +- .../skills/agentv-eval-writer/SKILL.md | 2 +- .../references/eval-schema.json | 4 +- .../skills/agentv-trace-analyst/SKILL.md | 12 +-- 103 files changed, 572 insertions(+), 592 deletions(-) rename apps/studio/src/routes/runs/{$runId_.dataset.$dataset.tsx => $runId_.suite.$suite.tsx} (88%) diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 4c072d661..03c2e901f 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -94,7 +94,7 @@ export interface AggregateGradingArtifact { export interface IndexArtifactEntry { readonly timestamp: string; readonly test_id: string; - readonly dataset?: string; + readonly suite?: string; readonly category?: string; readonly conversation_id?: string; readonly score: number; @@ -459,13 +459,13 @@ function safeTestId(testId: string | undefined): string { return safeArtifactPathSegment(testId, 'unknown'); } -function getDataset(result: EvaluationResult): string | undefined { - return result.dataset; +function getSuite(result: EvaluationResult): string | undefined { + return result.suite; } function buildArtifactSubdir(result: EvaluationResult): string { const segments = []; - const evalSet = getDataset(result); + const evalSet = getSuite(result); if (evalSet) { segments.push(safeArtifactPathSegment(evalSet, 'default')); } @@ -504,7 +504,7 @@ export function buildIndexArtifactEntry( return { timestamp: result.timestamp, test_id: result.testId ?? 'unknown', - dataset: getDataset(result), + suite: getSuite(result), category: result.category, conversation_id: result.conversationId, score: result.score, @@ -536,7 +536,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA return { timestamp: result.timestamp, test_id: result.testId ?? 'unknown', - dataset: getDataset(result), + suite: getSuite(result), category: result.category, conversation_id: result.conversationId, score: result.score, diff --git a/apps/cli/src/commands/eval/discover.ts b/apps/cli/src/commands/eval/discover.ts index f8ea59e19..cbb57c44b 100644 --- a/apps/cli/src/commands/eval/discover.ts +++ b/apps/cli/src/commands/eval/discover.ts @@ -17,8 +17,8 @@ export interface DiscoveredEvalFile { * Discover eval files by glob pattern matching. * * Uses `eval_patterns` from `.agentv/config.yaml` if configured, - * otherwise falls back to default patterns that match `dataset*.yaml` - * and `eval.yaml` files under `evals/` directories. + * otherwise falls back to default patterns that match `suite*.yaml`, + * `eval.yaml`, and legacy `dataset*.yaml` files under `evals/` directories. */ export async function discoverEvalFiles(cwd: string): Promise { const repoRoot = await findRepoRoot(cwd); diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts index c53beca45..7e8e88d95 100644 --- a/apps/cli/src/commands/eval/junit-writer.ts +++ b/apps/cli/src/commands/eval/junit-writer.ts @@ -47,7 +47,7 @@ export class JunitWriter { const grouped = new Map(); for (const result of this.results) { - const suite = result.dataset ?? 'default'; + const suite = result.suite ?? 'default'; const existing = grouped.get(suite); if (existing) { existing.push(result); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 53544e4e0..61aab86a6 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -480,7 +480,7 @@ async function prepareFileMetadata(params: { readonly testCases: readonly EvalTest[]; readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string }[]; readonly trialsConfig?: TrialsConfig; - readonly datasetTargets?: readonly string[]; + readonly suiteTargets?: readonly string[]; readonly yamlWorkers?: number; readonly yamlCache?: boolean; readonly yamlCachePath?: string; @@ -501,23 +501,23 @@ async function prepareFileMetadata(params: { const relativePath = path.relative(cwd, testFilePath); const category = deriveCategory(relativePath); - const dataset = await loadTestSuite(testFilePath, repoRoot, { + const suite = await loadTestSuite(testFilePath, repoRoot, { verbose: options.verbose, filter: options.filter, category, }); - const testIds = dataset.tests.map((value) => value.id); + const testIds = suite.tests.map((value) => value.id); // Determine target names: CLI --target flags override YAML const cliTargets = options.cliTargets; - const datasetTargets = dataset.targets; + const suiteTargets = suite.targets; - // Resolve which target names to use (precedence: CLI > dataset YAML targets > default) + // Resolve which target names to use (precedence: CLI > suite YAML targets > default) let targetNames: readonly string[]; if (cliTargets.length > 0) { targetNames = cliTargets; - } else if (datasetTargets && datasetTargets.length > 0) { - targetNames = datasetTargets; + } else if (suiteTargets && suiteTargets.length > 0) { + targetNames = suiteTargets; } else { targetNames = []; } @@ -568,17 +568,17 @@ async function prepareFileMetadata(params: { return { testIds, - testCases: dataset.tests, + testCases: suite.tests, selections, - trialsConfig: dataset.trials, - datasetTargets, - yamlWorkers: dataset.workers, - yamlCache: dataset.cacheConfig?.enabled, - yamlCachePath: dataset.cacheConfig?.cachePath, - totalBudgetUsd: dataset.totalBudgetUsd, - failOnError: dataset.failOnError, - threshold: dataset.threshold, - tags: dataset.metadata?.tags, + trialsConfig: suite.trials, + suiteTargets, + yamlWorkers: suite.workers, + yamlCache: suite.cacheConfig?.enabled, + yamlCachePath: suite.cacheConfig?.cachePath, + totalBudgetUsd: suite.totalBudgetUsd, + failOnError: suite.failOnError, + threshold: suite.threshold, + tags: suite.metadata?.tags, }; } @@ -1021,7 +1021,7 @@ export async function runEvalCommand( inlineTargetLabel: string; }[]; readonly trialsConfig?: TrialsConfig; - readonly datasetTargets?: readonly string[]; + readonly suiteTargets?: readonly string[]; readonly yamlWorkers?: number; readonly yamlCache?: boolean; readonly yamlCachePath?: string; @@ -1104,7 +1104,7 @@ export async function runEvalCommand( console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`); } - // Resolve dataset-level threshold: CLI --threshold takes precedence over YAML execution.threshold. + // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold. const yamlThreshold = firstMeta?.threshold; const resolvedThreshold = options.threshold ?? yamlThreshold; if (resolvedThreshold !== undefined && (resolvedThreshold < 0 || resolvedThreshold > 1)) { @@ -1128,13 +1128,13 @@ export async function runEvalCommand( // In matrix mode, total eval count is tests × targets (accounting for per-test target overrides) let totalEvalCount = 0; for (const meta of fileMetadata.values()) { - const datasetTargetNames = meta.selections.map((s) => s.selection.targetName); + const suiteTargetNames = meta.selections.map((s) => s.selection.targetName); for (const test of meta.testCases) { - // Per-test targets override dataset-level targets. + // Per-test targets override suite-level targets. const testTargetNames = test.targets && test.targets.length > 0 - ? test.targets.filter((t) => datasetTargetNames.includes(t)) - : datasetTargetNames; + ? test.targets.filter((t) => suiteTargetNames.includes(t)) + : suiteTargetNames; totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1; } } diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index ee355c5b2..cb4bcc4ad 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -37,15 +37,15 @@ export const evalBenchCommand = command({ const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8')); const testIds: string[] = manifest.test_ids; const targetName: string = manifest.target?.name ?? 'unknown'; - const datasetName: string = manifest.dataset ?? ''; + const suiteName: string = manifest.suite ?? ''; const experiment: string | undefined = manifest.experiment; - const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; + const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; const indexLines: string[] = []; const allPassRates: number[] = []; for (const testId of testIds) { - const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId]; + const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId]; const testDir = join(exportDir, ...subpath); const artifactSubdir = subpath.join('/'); const evaluators: EvaluatorScore[] = []; @@ -177,7 +177,7 @@ export const evalBenchCommand = command({ JSON.stringify({ timestamp: manifest.timestamp, test_id: testId, - dataset: datasetName || undefined, + suite: suiteName || undefined, experiment: experiment || undefined, score: Math.round(weightedScore * 1000) / 1000, target: targetName, diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index 45faa8608..b9263c399 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -10,7 +10,7 @@ * Progress is printed to stderr so users see real-time feedback. * * Export directory additions: - * ///code_grader_results/.json + * ///code_grader_results/.json */ import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; @@ -196,14 +196,14 @@ export const evalGradeCommand = command({ const manifestPath = join(exportDir, 'manifest.json'); const manifest = JSON.parse(await readFile(manifestPath, 'utf8')); const testIds: string[] = manifest.test_ids; - const datasetName: string = manifest.dataset ?? ''; - const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; + const suiteName: string = manifest.suite ?? ''; + const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; // Collect all grader tasks upfront so we know the total count const tasks: GraderTask[] = []; for (const testId of testIds) { - const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId]; + const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId]; const testDir = join(exportDir, ...subpath); const codeGradersDir = join(testDir, 'code_graders'); const resultsDir = join(testDir, 'code_grader_results'); diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 28b43b391..3eb7ad0a4 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -9,7 +9,7 @@ * Export directory layout: * / * ├── manifest.json - * └── / (omitted if eval.yaml has no name) + * └── / (omitted if eval.yaml has no name) * └── / * ├── input.json * ├── invoke.json @@ -58,8 +58,8 @@ export const evalInputCommand = command({ const evalDir = dirname(resolvedEvalPath); const category = deriveCategory(relative(process.cwd(), resolvedEvalPath)); - const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); - const tests = dataset.tests; + const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); + const tests = suite.tests; if (tests.length === 0) { console.error('No tests found in eval file.'); @@ -107,13 +107,13 @@ export const evalInputCommand = command({ // No targets file found — subagent-as-target mode } - const datasetName = dataset.metadata?.name?.trim() ?? ''; - const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; + const suiteName = suite.metadata?.name?.trim() ?? ''; + const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; const testIds: string[] = []; for (const test of tests) { - const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id]; + const subpath = safeSuiteName ? [safeSuiteName, test.id] : [test.id]; const testDir = join(outDir, ...subpath); await mkdir(testDir, { recursive: true }); testIds.push(test.id); @@ -168,7 +168,7 @@ export const evalInputCommand = command({ // manifest.json await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, - dataset: datasetName || undefined, + suite: suiteName || undefined, experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 372bfd04f..f91db3dad 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -100,8 +100,8 @@ export const evalRunCommand = command({ // ── Step 1: Extract inputs (same as pipeline input) ────────────── const category = deriveCategory(relative(process.cwd(), resolvedEvalPath)); - const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); - const tests = dataset.tests; + const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category }); + const tests = suite.tests; if (tests.length === 0) { console.error('No tests found in eval file.'); @@ -145,13 +145,13 @@ export const evalRunCommand = command({ // No targets file — subagent-as-target mode } - const datasetName = dataset.metadata?.name?.trim() ?? ''; - const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : ''; + const suiteName = suite.metadata?.name?.trim() ?? ''; + const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; const testIds: string[] = []; for (const test of tests) { - const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id]; + const subpath = safeSuiteName ? [safeSuiteName, test.id] : [test.id]; const testDir = join(outDir, ...subpath); await mkdir(testDir, { recursive: true }); testIds.push(test.id); @@ -198,7 +198,7 @@ export const evalRunCommand = command({ await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, - dataset: datasetName || undefined, + suite: suiteName || undefined, experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { name: targetName, kind: targetKind }, @@ -230,7 +230,7 @@ export const evalRunCommand = command({ writeInvProgress(); const invokeTarget = async (testId: string): Promise => { - const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId]; + const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId]; const testDir = join(outDir, ...subpath); const invoke = JSON.parse(await readFile(join(testDir, 'invoke.json'), 'utf8')); if (invoke.kind !== 'cli') return; @@ -341,7 +341,7 @@ export const evalRunCommand = command({ const graderTasks: GraderTask[] = []; for (const testId of testIds) { - const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId]; + const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId]; const testDir = join(outDir, ...subpath); const codeGradersDir = join(testDir, 'code_graders'); const resultsDir = join(testDir, 'code_grader_results'); diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index cffb4760a..98e8a5527 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -13,7 +13,7 @@ import { export interface ResultManifestRecord { readonly timestamp?: string; readonly test_id?: string; - readonly dataset?: string; + readonly suite?: string; readonly category?: string; readonly experiment?: string; readonly target?: string; @@ -123,7 +123,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E return { timestamp: record.timestamp, testId, - dataset: record.dataset, + suite: record.suite, category: record.category, target: record.target, score: record.score, @@ -189,6 +189,7 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] { export interface LightweightResultRecord { readonly testId: string; + readonly suite?: string; readonly target?: string; readonly experiment?: string; readonly score: number; @@ -203,6 +204,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec const content = readFileSync(resolvedSourceFile, 'utf8'); return parseResultManifest(content).map((record) => ({ testId: record.test_id ?? 'unknown', + suite: record.suite, target: record.target, experiment: record.experiment, score: record.score, diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index e00e7e837..92f2b20d5 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -14,7 +14,7 @@ * - GET /api/projects — list registered projects * - GET /api/projects/:projectId/runs — project-scoped run list * - * All data routes (runs, datasets, categories, evals, experiments, targets) + * All data routes (runs, suites, categories, evals, experiments, targets) * exist in both unscoped (/api/...) and project-scoped (/api/projects/:projectId/...) * variants. They share handler functions via DataContext, differing only in * how searchDir is resolved. @@ -275,32 +275,32 @@ function handleRunDetail(c: C, { searchDir }: DataContext) { } } -function handleRunDatasets(c: C, { searchDir, agentvDir }: DataContext) { +function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) { const filename = c.req.param('filename'); const meta = listResultFiles(searchDir).find((m) => m.filename === filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); const { pass_threshold } = loadStudioConfig(agentvDir); - const datasetMap = new Map(); + const suiteMap = new Map(); for (const r of loaded) { - const ds = r.dataset ?? r.target ?? 'default'; - const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; + const ds = r.suite ?? r.target ?? 'default'; + const entry = suiteMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; - datasetMap.set(ds, entry); + suiteMap.set(ds, entry); } - const datasets = [...datasetMap.entries()].map(([name, entry]) => ({ + const suites = [...suiteMap.entries()].map(([name, entry]) => ({ name, total: entry.total, passed: entry.passed, failed: entry.total - entry.passed, avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0, })); - return c.json({ datasets }); + return c.json({ suites }); } catch { - return c.json({ error: 'Failed to load datasets' }, 500); + return c.json({ error: 'Failed to load suites' }, 500); } } @@ -313,7 +313,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { const { pass_threshold } = loadStudioConfig(agentvDir); const categoryMap = new Map< string, - { total: number; passed: number; scoreSum: number; datasets: Set } + { total: number; passed: number; scoreSum: number; suites: Set } >(); for (const r of loaded) { const cat = r.category ?? DEFAULT_CATEGORY; @@ -321,12 +321,12 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { total: 0, passed: 0, scoreSum: 0, - datasets: new Set(), + suites: new Set(), }; entry.total++; if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; - entry.datasets.add(r.dataset ?? r.target ?? 'default'); + entry.suites.add(r.suite ?? r.target ?? 'default'); categoryMap.set(cat, entry); } const categories = [...categoryMap.entries()].map(([name, entry]) => ({ @@ -335,7 +335,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { passed: entry.passed, failed: entry.total - entry.passed, avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0, - dataset_count: entry.datasets.size, + suite_count: entry.suites.size, })); return c.json({ categories }); } catch { @@ -343,7 +343,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { } } -function handleCategoryDatasets(c: C, { searchDir, agentvDir }: DataContext) { +function handleCategorySuites(c: C, { searchDir, agentvDir }: DataContext) { const filename = c.req.param('filename'); const category = decodeURIComponent(c.req.param('category') ?? ''); const meta = listResultFiles(searchDir).find((m) => m.filename === filename); @@ -352,25 +352,25 @@ function handleCategoryDatasets(c: C, { searchDir, agentvDir }: DataContext) { const loaded = loadManifestResults(meta.path); const { pass_threshold } = loadStudioConfig(agentvDir); const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category); - const datasetMap = new Map(); + const suiteMap = new Map(); for (const r of filtered) { - const ds = r.dataset ?? r.target ?? 'default'; - const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; + const ds = r.suite ?? r.target ?? 'default'; + const entry = suiteMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 }; entry.total++; if (r.score >= pass_threshold) entry.passed++; entry.scoreSum += r.score; - datasetMap.set(ds, entry); + suiteMap.set(ds, entry); } - const datasets = [...datasetMap.entries()].map(([name, entry]) => ({ + const suites = [...suiteMap.entries()].map(([name, entry]) => ({ name, total: entry.total, passed: entry.passed, failed: entry.total - entry.passed, avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0, })); - return c.json({ datasets }); + return c.json({ suites }); } catch { - return c.json({ error: 'Failed to load datasets' }, 500); + return c.json({ error: 'Failed to load suites' }, 500); } } @@ -780,10 +780,10 @@ export function createApp( app.get('/api/config', (c) => handleConfig(c, defaultCtx)); app.get('/api/runs', (c) => handleRuns(c, defaultCtx)); app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx)); - app.get('/api/runs/:filename/datasets', (c) => handleRunDatasets(c, defaultCtx)); + app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx)); app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx)); - app.get('/api/runs/:filename/categories/:category/datasets', (c) => - handleCategoryDatasets(c, defaultCtx), + app.get('/api/runs/:filename/categories/:category/suites', (c) => + handleCategorySuites(c, defaultCtx), ); app.get('/api/runs/:filename/evals/:evalId', (c) => handleEvalDetail(c, defaultCtx)); app.get('/api/runs/:filename/evals/:evalId/files', (c) => handleEvalFiles(c, defaultCtx)); @@ -872,14 +872,12 @@ export function createApp( app.get('/api/projects/:projectId/config', (c) => withProject(c, handleConfig)); app.get('/api/projects/:projectId/runs', (c) => withProject(c, handleRuns)); app.get('/api/projects/:projectId/runs/:filename', (c) => withProject(c, handleRunDetail)); - app.get('/api/projects/:projectId/runs/:filename/datasets', (c) => - withProject(c, handleRunDatasets), - ); + app.get('/api/projects/:projectId/runs/:filename/suites', (c) => withProject(c, handleRunSuites)); app.get('/api/projects/:projectId/runs/:filename/categories', (c) => withProject(c, handleRunCategories), ); - app.get('/api/projects/:projectId/runs/:filename/categories/:category/datasets', (c) => - withProject(c, handleCategoryDatasets), + app.get('/api/projects/:projectId/runs/:filename/categories/:category/suites', (c) => + withProject(c, handleCategorySuites), ); app.get('/api/projects/:projectId/runs/:filename/evals/:evalId', (c) => withProject(c, handleEvalDetail), diff --git a/apps/cli/src/commands/trace/show.ts b/apps/cli/src/commands/trace/show.ts index 598a4753f..50e12f7e7 100644 --- a/apps/cli/src/commands/trace/show.ts +++ b/apps/cli/src/commands/trace/show.ts @@ -225,7 +225,7 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st // Standard flat view const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red; lines.push( - `${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.dataset ? ` ${c.dim}dataset: ${result.dataset}${c.reset}` : ''}`, + `${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.suite ? ` ${c.dim}suite: ${result.suite}${c.reset}` : ''}`, ); if (result.error) { diff --git a/apps/cli/src/commands/trace/stats.ts b/apps/cli/src/commands/trace/stats.ts index 6a88d10d0..cf3df312c 100644 --- a/apps/cli/src/commands/trace/stats.ts +++ b/apps/cli/src/commands/trace/stats.ts @@ -109,8 +109,8 @@ function groupResults(results: RawResult[], groupBy?: string): GroupedResults[] case 'target': key = result.target ?? 'unknown'; break; - case 'dataset': - key = result.dataset ?? 'unknown'; + case 'suite': + key = result.suite ?? 'unknown'; break; case 'test-id': key = result.test_id ?? result.eval_id ?? 'unknown'; diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts index 443a1466f..f10a97ab4 100644 --- a/apps/cli/src/commands/trace/utils.ts +++ b/apps/cli/src/commands/trace/utils.ts @@ -51,7 +51,7 @@ export interface RawResult { timestamp?: string; test_id?: string; eval_id?: string; - dataset?: string; + suite?: string; conversation_id?: string; score: number; assertions?: { text: string; passed: boolean; evidence?: string }[]; @@ -149,7 +149,7 @@ function toRawResult(result: EvaluationResult): RawResult { return { timestamp: result.timestamp, test_id: result.testId, - dataset: result.dataset, + suite: result.suite, conversation_id: result.conversationId, score: result.score, assertions: result.assertions?.map((assertion) => ({ @@ -334,7 +334,7 @@ function loadOtlpTraceFile(filePath: string): RawResult[] { stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`, - dataset: stringAttr(rootAttrs.agentv_dataset), + suite: stringAttr(rootAttrs.agentv_suite), target: stringAttr(rootAttrs.agentv_target), score, error: root.status?.code === 2 ? root.status.message : undefined, diff --git a/apps/cli/src/commands/trend/index.ts b/apps/cli/src/commands/trend/index.ts index edd616d77..7ef5218ff 100644 --- a/apps/cli/src/commands/trend/index.ts +++ b/apps/cli/src/commands/trend/index.ts @@ -39,7 +39,7 @@ export interface TrendRunPoint { } export interface TrendFilters { - readonly dataset?: string; + readonly suite?: string; readonly target?: string; readonly allowMissingTests: boolean; } @@ -163,11 +163,11 @@ export function resolveTrendSources( function filterRunRecords( records: readonly LightweightResultRecord[], sourcePath: string, - dataset?: string, + suite?: string, target?: string, ): TrendRunRecord[] { return records - .filter((record) => (dataset ? record.dataset === dataset : true)) + .filter((record) => (suite ? record.suite === suite : true)) .filter((record) => (target ? record.target === target : true)) .map((record) => ({ ...record, sourcePath })); } @@ -268,28 +268,22 @@ export function determineTrendExitCode( export function analyzeTrend(params: { readonly sourcePaths: readonly string[]; - readonly dataset?: string; + readonly suite?: string; readonly target?: string; readonly slopeThreshold: number; readonly allowMissingTests: boolean; readonly failOnDegrading: boolean; }): TrendOutput { - const { sourcePaths, dataset, target, slopeThreshold, allowMissingTests, failOnDegrading } = - params; + const { sourcePaths, suite, target, slopeThreshold, allowMissingTests, failOnDegrading } = params; if (sourcePaths.length < 2) { throw new Error('Trend analysis requires at least 2 runs'); } const filteredRuns = sourcePaths.map((sourcePath) => { - const records = filterRunRecords( - loadLightweightResults(sourcePath), - sourcePath, - dataset, - target, - ); + const records = filterRunRecords(loadLightweightResults(sourcePath), sourcePath, suite, target); if (records.length === 0) { - const filters = [dataset ? `dataset=${dataset}` : '', target ? `target=${target}` : ''] + const filters = [suite ? `suite=${suite}` : '', target ? `target=${target}` : ''] .filter(Boolean) .join(', '); const suffix = filters ? ` after filtering by ${filters}` : ''; @@ -339,7 +333,7 @@ export function analyzeTrend(params: { return { runs, filters: { - dataset, + suite, target, allowMissingTests, }, @@ -377,7 +371,7 @@ export function formatTrendTable(output: TrendOutput): string { `${c.bold}Runs:${c.reset} ${output.summary.runCount} | ${c.bold}Range:${c.reset} ${output.summary.dateRange.start ?? 'unknown'} → ${output.summary.dateRange.end ?? 'unknown'}`, ); lines.push( - `${c.bold}Filters:${c.reset} dataset=${output.filters.dataset ?? '*'} target=${output.filters.target ?? '*'} mode=${output.filters.allowMissingTests ? 'independent' : 'matched-tests'}`, + `${c.bold}Filters:${c.reset} suite=${output.filters.suite ?? '*'} target=${output.filters.target ?? '*'} mode=${output.filters.allowMissingTests ? 'independent' : 'matched-tests'}`, ); lines.push( `${c.bold}Matched Tests:${c.reset} ${output.summary.matchedTestCount} | ${c.bold}Verdict:${c.reset} ${colorizeDirection(output.summary.direction)}`, @@ -422,10 +416,10 @@ export const trendCommand = command({ long: 'last', description: 'Use the most recent N runs from .agentv/results/runs/', }), - dataset: option({ + suite: option({ type: optional(string), - long: 'dataset', - description: 'Filter records to a dataset name', + long: 'suite', + description: 'Filter records to a suite name', }), target: option({ type: optional(string), @@ -459,7 +453,7 @@ export const trendCommand = command({ handler: async ({ runs, last, - dataset, + suite, target, slopeThreshold, failOnDegrading, @@ -478,7 +472,7 @@ export const trendCommand = command({ const sourcePaths = resolveTrendSources(process.cwd(), runs, last); const output = analyzeTrend({ sourcePaths, - dataset, + suite, target, slopeThreshold: effectiveSlopeThreshold, allowMissingTests, diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 16708a8a7..9826d2be1 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -417,7 +417,7 @@ describe('buildIndexArtifactEntry', () => { makeResult({ testId: 'alpha', target: 'claude', - dataset: 'demo', + suite: 'demo', scores: [makeEvaluatorResult({ name: 'quality', score: 0.7 })], executionStatus: 'quality_failure', error: 'model drift', @@ -434,7 +434,7 @@ describe('buildIndexArtifactEntry', () => { expect(JSON.parse(JSON.stringify(entry))).toEqual({ timestamp: '2026-03-13T00:00:00.000Z', test_id: 'alpha', - dataset: 'demo', + suite: 'demo', score: 0.9, target: 'claude', scores: [ @@ -699,9 +699,9 @@ describe('writeArtifactsFromResults', () => { expect(grading.assertions[0].text).toBe('baseline-check'); }); - it('prefixes artifact paths with dataset when present', async () => { + it('prefixes artifact paths with suite when present', async () => { const paths = await writeArtifactsFromResults( - [makeResult({ dataset: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], + [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], testDir, ); diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts index feffdef4b..21d5f107d 100644 --- a/apps/cli/test/commands/eval/output-writers.test.ts +++ b/apps/cli/test/commands/eval/output-writers.test.ts @@ -123,12 +123,12 @@ describe('JunitWriter', () => { expect(xml).toContain('score=0.300'); }); - it('should group results by dataset as testsuites', async () => { + it('should group results by suite as testsuites', async () => { const writer = await JunitWriter.open(testFilePath); - await writer.append(makeResult({ testId: 'a-1', dataset: 'suite-a', score: 1.0 })); - await writer.append(makeResult({ testId: 'a-2', dataset: 'suite-a', score: 0.8 })); - await writer.append(makeResult({ testId: 'b-1', dataset: 'suite-b', score: 0.5 })); + await writer.append(makeResult({ testId: 'a-1', suite: 'suite-a', score: 1.0 })); + await writer.append(makeResult({ testId: 'a-2', suite: 'suite-a', score: 0.8 })); + await writer.append(makeResult({ testId: 'b-1', suite: 'suite-b', score: 0.5 })); await writer.close(); const xml = await readFile(testFilePath, 'utf8'); @@ -136,7 +136,7 @@ describe('JunitWriter', () => { expect(xml).toContain('testsuite name="suite-b" tests="1"'); }); - it('should use default suite name when no dataset', async () => { + it('should use default suite name when no suite', async () => { const writer = await JunitWriter.open(testFilePath); await writer.append(makeResult({ testId: 'test-1', score: 1.0 })); await writer.close(); diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index 47bba1768..320fd524f 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -23,7 +23,7 @@ import { exportResults } from '../../../src/commands/results/export.js'; const CLAUDE_CLI_RESULT = { timestamp: '2026-03-18T10:00:00.000Z', test_id: 'test-claude-reasoning', - dataset: 'multi-provider', + suite: 'multi-provider', score: 1.0, assertions: [ { text: 'Correct answer', passed: true, evidence: 'Matched expected output' }, @@ -60,7 +60,7 @@ const CLAUDE_CLI_RESULT = { const CODEX_RESULT = { timestamp: '2026-03-18T10:01:00.000Z', test_id: 'test-codex-edit', - dataset: 'multi-provider', + suite: 'multi-provider', score: 0.9, assertions: [ { text: 'File edited correctly', passed: true }, @@ -96,7 +96,7 @@ const CODEX_RESULT = { const COPILOT_RESULT = { timestamp: '2026-03-18T10:02:00.000Z', test_id: 'test-copilot-complete', - dataset: 'multi-provider', + suite: 'multi-provider', score: 0.85, assertions: [ { text: 'Code completion correct', passed: true }, @@ -125,7 +125,7 @@ const COPILOT_RESULT = { const PI_RESULT = { timestamp: '2026-03-18T10:03:00.000Z', test_id: 'test-pi-refactor', - dataset: 'multi-provider', + suite: 'multi-provider', score: 0.75, assertions: [ { text: 'Refactored correctly', passed: true }, @@ -143,7 +143,7 @@ const PI_RESULT = { const LLM_AZURE_RESULT = { timestamp: '2026-03-18T10:04:00.000Z', test_id: 'test-llm-analysis', - dataset: 'multi-provider', + suite: 'multi-provider', score: 1.0, assertions: [{ text: 'Analysis correct', passed: true }], output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }], @@ -166,7 +166,7 @@ const LLM_AZURE_RESULT = { const LLM_GPT_RESULT = { timestamp: '2026-03-18T10:05:00.000Z', test_id: 'test-llm-analysis', - dataset: 'multi-provider', + suite: 'multi-provider', score: 0.8, assertions: [{ text: 'Analysis correct', passed: true }], output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }], @@ -181,7 +181,7 @@ const LLM_GPT_RESULT = { const MINIMAL_RESULT = { timestamp: '2026-03-18T10:06:00.000Z', test_id: 'test-minimal', - dataset: 'multi-provider', + suite: 'multi-provider', score: 0.5, assertions: [{ text: 'Exists', passed: true }], output: [{ role: 'assistant', content: 'Response.' }], @@ -193,7 +193,7 @@ const MINIMAL_RESULT = { const ERROR_RESULT = { timestamp: '2026-03-18T10:07:00.000Z', test_id: 'test-error-case', - dataset: 'multi-provider', + suite: 'multi-provider', score: 0, assertions: [], output: [], @@ -210,9 +210,9 @@ function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } -function artifactDir(outputDir: string, record: { dataset?: string; test_id?: string }): string { +function artifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string { const testId = record.test_id ?? 'unknown'; - return path.join(outputDir, ...(record.dataset ? [record.dataset] : []), testId); + return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId); } describe('export e2e — multi-provider metrics verification', () => { @@ -634,7 +634,7 @@ describe('export e2e — multi-provider metrics verification', () => { const record = { timestamp: '2026-03-18T10:00:00.000Z', test_id: 'test-case-convert', - dataset: 'test', + suite: 'test', score: 1.0, assertions: [{ text: 'ok', passed: true }], output_text: 'ok', diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index 8b123bc57..60d54661a 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -20,7 +20,7 @@ import { const RESULT_FULL = { timestamp: '2026-03-18T10:00:01.000Z', test_id: 'test-greeting', - dataset: 'demo', + suite: 'demo', score: 1.0, assertions: [ { text: 'Says hello', passed: true }, @@ -44,7 +44,7 @@ const RESULT_FULL = { const RESULT_PARTIAL = { timestamp: '2026-03-18T10:00:05.000Z', test_id: 'test-math', - dataset: 'demo', + suite: 'demo', score: 0.5, assertions: [ { text: 'Correct formula', passed: true }, @@ -70,7 +70,7 @@ const RESULT_PARTIAL = { const RESULT_DIFFERENT_TARGET = { timestamp: '2026-03-18T10:00:10.000Z', test_id: 'test-greeting', - dataset: 'demo', + suite: 'demo', score: 0.75, assertions: [ { text: 'Says hello', passed: true }, @@ -85,7 +85,7 @@ const RESULT_DIFFERENT_TARGET = { const RESULT_NO_TRACE = { timestamp: '2026-03-18T10:00:15.000Z', test_id: 'test-simple', - dataset: 'demo', + suite: 'demo', score: 1.0, assertions: [{ text: 'Correct', passed: true }], output: [{ role: 'assistant', content: 'Yes.' }], @@ -99,9 +99,9 @@ function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } -function artifactDir(outputDir: string, record: { dataset?: string; test_id?: string }): string { +function artifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string { const testId = record.test_id ?? 'unknown'; - return path.join(outputDir, ...(record.dataset ? [record.dataset] : []), testId); + return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId); } describe('results export', () => { diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 2d7766622..343625fea 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -10,7 +10,7 @@ import { createApp, loadResults, resolveSourceFile } from '../../../src/commands const RESULT_A = { timestamp: '2026-03-18T10:00:01.000Z', test_id: 'test-greeting', - dataset: 'demo', + suite: 'demo', score: 1.0, assertions: [ { text: 'Says hello', passed: true }, @@ -34,7 +34,7 @@ const RESULT_A = { const RESULT_B = { timestamp: '2026-03-18T10:00:05.000Z', test_id: 'test-math', - dataset: 'demo', + suite: 'demo', score: 0.5, assertions: [ { text: 'Correct formula', passed: true }, diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index 32ea668cb..3f157b893 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -16,7 +16,7 @@ import { const RESULT_WITH_TRACE = JSON.stringify({ timestamp: '2026-02-20T21:38:05.833Z', test_id: 'test-1', - dataset: 'demo', + suite: 'demo', score: 1, assertions: [{ text: 'criterion-1', passed: true }], target: 'default', @@ -34,7 +34,7 @@ const RESULT_WITH_TRACE = JSON.stringify({ const RESULT_WITHOUT_TRACE = JSON.stringify({ timestamp: '2026-02-20T21:38:06.000Z', test_id: 'test-2', - dataset: 'demo', + suite: 'demo', score: 0.75, assertions: [ { text: 'criterion-1', passed: true }, @@ -46,7 +46,7 @@ const RESULT_WITHOUT_TRACE = JSON.stringify({ const RESULT_FAILING = JSON.stringify({ timestamp: '2026-02-20T21:38:07.000Z', test_id: 'test-3', - dataset: 'demo', + suite: 'demo', score: 0, assertions: [ { text: 'criterion-1', passed: false }, diff --git a/apps/cli/test/commands/trend/trend.test.ts b/apps/cli/test/commands/trend/trend.test.ts index 2f32e184e..b29887919 100644 --- a/apps/cli/test/commands/trend/trend.test.ts +++ b/apps/cli/test/commands/trend/trend.test.ts @@ -21,7 +21,7 @@ const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts'); interface RunRecordInput { readonly test_id: string; readonly score: number; - readonly dataset?: string; + readonly suite?: string; readonly target?: string; readonly timestamp?: string; } @@ -55,28 +55,28 @@ describe('trend command', () => { ); }); - it('computes a degrading trend over matched tests after dataset and target filtering', async () => { + it('computes a degrading trend over matched tests after suite and target filtering', async () => { const cwd = await createTempDir(); cleanupDirs.push(cwd); const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ { test_id: 't1', - dataset: 'code-review', + suite: 'code-review', target: 'claude-sonnet', score: 0.95, timestamp: '2026-03-01T10:00:00.000Z', }, { test_id: 't2', - dataset: 'code-review', + suite: 'code-review', target: 'claude-sonnet', score: 0.85, timestamp: '2026-03-01T10:00:00.000Z', }, { test_id: 't1', - dataset: 'code-review', + suite: 'code-review', target: 'gpt-5', score: 0.7, timestamp: '2026-03-01T10:00:00.000Z', @@ -85,21 +85,21 @@ describe('trend command', () => { const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ { test_id: 't1', - dataset: 'code-review', + suite: 'code-review', target: 'claude-sonnet', score: 0.85, timestamp: '2026-03-08T10:00:00.000Z', }, { test_id: 't2', - dataset: 'code-review', + suite: 'code-review', target: 'claude-sonnet', score: 0.75, timestamp: '2026-03-08T10:00:00.000Z', }, { test_id: 't1', - dataset: 'code-review', + suite: 'code-review', target: 'gpt-5', score: 0.8, timestamp: '2026-03-08T10:00:00.000Z', @@ -108,21 +108,21 @@ describe('trend command', () => { const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ { test_id: 't1', - dataset: 'code-review', + suite: 'code-review', target: 'claude-sonnet', score: 0.75, timestamp: '2026-03-15T10:00:00.000Z', }, { test_id: 't2', - dataset: 'code-review', + suite: 'code-review', target: 'claude-sonnet', score: 0.65, timestamp: '2026-03-15T10:00:00.000Z', }, { test_id: 't1', - dataset: 'code-review', + suite: 'code-review', target: 'gpt-5', score: 0.9, timestamp: '2026-03-15T10:00:00.000Z', @@ -131,7 +131,7 @@ describe('trend command', () => { const output = analyzeTrend({ sourcePaths: [run1.indexPath, run2.indexPath, run3.indexPath], - dataset: 'code-review', + suite: 'code-review', target: 'claude-sonnet', slopeThreshold: 0.01, allowMissingTests: false, @@ -155,14 +155,14 @@ describe('trend command', () => { const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.8, timestamp: '2026-03-01T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.6, timestamp: '2026-03-01T10:00:00.000Z', @@ -171,7 +171,7 @@ describe('trend command', () => { const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.9, timestamp: '2026-03-08T10:00:00.000Z', @@ -180,7 +180,7 @@ describe('trend command', () => { const output = analyzeTrend({ sourcePaths: [run1.indexPath, run2.indexPath], - dataset: 'suite', + suite: 'suite', target: 'alpha', slopeThreshold: 0.01, allowMissingTests: true, @@ -199,7 +199,7 @@ describe('trend command', () => { const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.8, timestamp: '2026-03-01T10:00:00.000Z', @@ -208,7 +208,7 @@ describe('trend command', () => { const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'beta', score: 0.7, timestamp: '2026-03-08T10:00:00.000Z', @@ -218,7 +218,7 @@ describe('trend command', () => { expect(() => analyzeTrend({ sourcePaths: [run1.indexPath, run2.indexPath], - dataset: 'suite', + suite: 'suite', target: 'alpha', slopeThreshold: 0.01, allowMissingTests: false, @@ -275,14 +275,14 @@ describe('trend command', () => { const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.9, timestamp: '2026-03-01T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.8, timestamp: '2026-03-01T10:00:00.000Z', @@ -291,14 +291,14 @@ describe('trend command', () => { const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.8, timestamp: '2026-03-08T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.7, timestamp: '2026-03-08T10:00:00.000Z', @@ -307,14 +307,14 @@ describe('trend command', () => { const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.7, timestamp: '2026-03-15T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.6, timestamp: '2026-03-15T10:00:00.000Z', @@ -330,7 +330,7 @@ describe('trend command', () => { run1.runDir, run2.indexPath, run3.runDir, - '--dataset', + '--suite', 'suite', '--target', 'alpha', @@ -342,7 +342,7 @@ describe('trend command', () => { expect(result.exitCode).toBe(0); const parsed = JSON.parse(result.stdout) as Record; expect(parsed.filters).toEqual({ - dataset: 'suite', + suite: 'suite', target: 'alpha', allow_missing_tests: false, }); @@ -357,14 +357,14 @@ describe('trend command', () => { const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.9, timestamp: '2026-03-01T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.8, timestamp: '2026-03-01T10:00:00.000Z', @@ -373,14 +373,14 @@ describe('trend command', () => { const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.8, timestamp: '2026-03-08T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.7, timestamp: '2026-03-08T10:00:00.000Z', @@ -389,14 +389,14 @@ describe('trend command', () => { const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.7, timestamp: '2026-03-15T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.6, timestamp: '2026-03-15T10:00:00.000Z', @@ -405,7 +405,7 @@ describe('trend command', () => { const output = analyzeTrend({ sourcePaths: [run3.runDir, run1.indexPath, run2.runDir], - dataset: 'suite', + suite: 'suite', target: 'alpha', slopeThreshold: 0.01, allowMissingTests: false, @@ -431,14 +431,14 @@ describe('trend command', () => { await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.95, timestamp: '2026-03-01T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.85, timestamp: '2026-03-01T10:00:00.000Z', @@ -447,14 +447,14 @@ describe('trend command', () => { await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.85, timestamp: '2026-03-08T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.75, timestamp: '2026-03-08T10:00:00.000Z', @@ -463,14 +463,14 @@ describe('trend command', () => { await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.75, timestamp: '2026-03-15T10:00:00.000Z', }, { test_id: 't2', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.65, timestamp: '2026-03-15T10:00:00.000Z', @@ -485,7 +485,7 @@ describe('trend command', () => { 'trend', '--last', '3', - '--dataset', + '--suite', 'suite', '--target', 'alpha', @@ -508,7 +508,7 @@ describe('trend command', () => { await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'alpha', score: 0.8, timestamp: '2026-03-01T10:00:00.000Z', @@ -517,7 +517,7 @@ describe('trend command', () => { await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ { test_id: 't1', - dataset: 'suite', + suite: 'suite', target: 'beta', score: 0.7, timestamp: '2026-03-08T10:00:00.000Z', @@ -526,17 +526,7 @@ describe('trend command', () => { const result = await execa( 'bun', - [ - '--no-env-file', - CLI_ENTRY, - 'trend', - '--last', - '2', - '--dataset', - 'suite', - '--target', - 'alpha', - ], + ['--no-env-file', CLI_ENTRY, 'trend', '--last', '2', '--suite', 'suite', '--target', 'alpha'], { cwd, reject: false }, ); diff --git a/apps/studio/src/components/Breadcrumbs.tsx b/apps/studio/src/components/Breadcrumbs.tsx index 602abc37e..9dedf70a5 100644 --- a/apps/studio/src/components/Breadcrumbs.tsx +++ b/apps/studio/src/components/Breadcrumbs.tsx @@ -34,9 +34,9 @@ function deriveSegments(matches: ReturnType): BreadcrumbSegme label: params.category ?? 'Category', to: match.pathname, }); - } else if (routeId.includes('/runs/$runId/dataset/$dataset')) { + } else if (routeId.includes('/runs/$runId/suite/$suite')) { segments.push({ - label: params.dataset ?? 'Dataset', + label: params.suite ?? 'Suite', to: match.pathname, }); } else if (routeId.includes('/runs/$runId')) { diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 4c2ea9b9f..6a9a25ee4 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -1,8 +1,8 @@ /** * Run detail component showing per-eval breakdown with score bars. * - * Groups results by category (from file path), then by dataset within each category. - * Categories are shown as collapsible sections with dataset cards inside. + * Groups results by category (from file path), then by suite within each category. + * Categories are shown as collapsible sections with suite cards inside. */ import { Link } from '@tanstack/react-router'; @@ -20,7 +20,7 @@ interface RunDetailProps { projectId?: string; } -interface DatasetStats { +interface SuiteStats { name: string; passed: number; failed: number; @@ -30,7 +30,7 @@ interface DatasetStats { interface CategoryGroup { name: string; - datasets: DatasetStats[]; + suites: SuiteStats[]; total: number; passed: number; failed: number; @@ -45,7 +45,7 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate for (const r of results) { const cat = r.category ?? 'Uncategorized'; - const ds = r.dataset ?? 'Uncategorized'; + const ds = r.suite ?? 'Uncategorized'; if (!categoryMap.has(cat)) categoryMap.set(cat, new Map()); // biome-ignore lint/style/noNonNullAssertion: map entry guaranteed by line above const dsMap = categoryMap.get(cat)!; @@ -59,7 +59,7 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate return Array.from(categoryMap.entries()) .map(([catName, dsMap]) => { - const datasets = Array.from(dsMap.entries()) + const suites = Array.from(dsMap.entries()) .map(([dsName, stats]) => ({ name: dsName, ...stats, @@ -67,14 +67,14 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate })) .sort((a, b) => a.name.localeCompare(b.name)); - const total = datasets.reduce((s, d) => s + d.total, 0); - const passed = datasets.reduce((s, d) => s + d.passed, 0); - const failed = datasets.reduce((s, d) => s + d.failed, 0); - const scoreSum = datasets.reduce((s, d) => s + d.avgScore * d.total, 0); + const total = suites.reduce((s, d) => s + d.total, 0); + const passed = suites.reduce((s, d) => s + d.passed, 0); + const failed = suites.reduce((s, d) => s + d.failed, 0); + const scoreSum = suites.reduce((s, d) => s + d.avgScore * d.total, 0); return { name: catName, - datasets, + suites, total, passed, failed, @@ -128,10 +128,10 @@ export function RunDetail({ results, runId, projectId }: RunDetailProps) { ) : (
-

Datasets

+

Suites

- {categories[0]?.datasets.map((ds) => ( - + {categories[0]?.suites.map((ds) => ( + ))}
@@ -210,7 +210,7 @@ function CategorySection({ category, runId }: { category: CategoryGroup; runId: {expanded ? '\u25BC' : '\u25B6'} {category.name} - {category.datasets.length} dataset{category.datasets.length !== 1 ? 's' : ''} + {category.suites.length} suite{category.suites.length !== 1 ? 's' : ''}
@@ -224,8 +224,8 @@ function CategorySection({ category, runId }: { category: CategoryGroup; runId: {expanded && (
- {category.datasets.map((ds) => ( - + {category.suites.map((ds) => ( + ))}
@@ -234,25 +234,25 @@ function CategorySection({ category, runId }: { category: CategoryGroup; runId: ); } -function DatasetCard({ dataset, runId }: { dataset: DatasetStats; runId: string }) { +function SuiteCard({ suite, runId }: { suite: SuiteStats; runId: string }) { return (
- {dataset.name} + {suite.name} - {dataset.passed}/{dataset.total} + {suite.passed}/{suite.total}
- +
- {dataset.passed} passed - {dataset.failed > 0 && {dataset.failed} failed} + {suite.passed} passed + {suite.failed > 0 && {suite.failed} failed}
); diff --git a/apps/studio/src/components/ScoreBar.tsx b/apps/studio/src/components/ScoreBar.tsx index 368909d09..2c2358c5c 100644 --- a/apps/studio/src/components/ScoreBar.tsx +++ b/apps/studio/src/components/ScoreBar.tsx @@ -2,7 +2,7 @@ * Gradient score bar component. * * Renders a horizontal bar from cyan-400 to blue-500, proportional to the - * score value (0..1). Used in run lists, dataset breakdowns, and eval detail. + * score value (0..1). Used in run lists, suite breakdowns, and eval detail. */ interface ScoreBarProps { diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index eed55444a..fa9f56d8b 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -4,7 +4,7 @@ * Adapts its content based on the current route: * - At root or run detail: shows list of runs * - At eval detail: shows list of evals in the current run with pass/fail indicators - * - At dataset detail: shows evals filtered to that dataset + * - At suite detail: shows evals filtered to that suite * - At experiment detail: shows list of experiments */ @@ -13,7 +13,7 @@ import { Link, useMatchRoute } from '@tanstack/react-router'; import { isPassing, useAllProjectRuns, - useCategoryDatasets, + useCategorySuites, useExperiments, useProjectList, useProjectRunDetail, @@ -68,8 +68,8 @@ export function Sidebar() { to: '/runs/$runId/category/$category', fuzzy: true, }); - const datasetMatch = matchRoute({ - to: '/runs/$runId/dataset/$dataset', + const suiteMatch = matchRoute({ + to: '/runs/$runId/suite/$suite', fuzzy: true, }); const experimentMatch = matchRoute({ @@ -82,9 +82,9 @@ export function Sidebar() { return ; } - if (datasetMatch && typeof datasetMatch === 'object' && 'runId' in datasetMatch) { - const { runId, dataset } = datasetMatch as { runId: string; dataset: string }; - return ; + if (suiteMatch && typeof suiteMatch === 'object' && 'runId' in suiteMatch) { + const { runId, suite } = suiteMatch as { runId: string; suite: string }; + return ; } if (evalMatch && typeof evalMatch === 'object' && 'runId' in evalMatch) { @@ -242,13 +242,11 @@ function EvalSidebar({ runId, currentEvalId }: { runId: string; currentEvalId: s ); } -function DatasetSidebar({ runId, dataset }: { runId: string; dataset: string }) { +function SuiteSidebar({ runId, suite }: { runId: string; suite: string }) { const { data } = useRunDetail(runId); const { data: config } = useStudioConfig(); const passThreshold = config?.pass_threshold ?? 0.8; - const datasetResults = (data?.results ?? []).filter( - (r) => (r.dataset ?? 'Uncategorized') === dataset, - ); + const suiteResults = (data?.results ?? []).filter((r) => (r.suite ?? 'Uncategorized') === suite); return (