From 5684e68cd7d137f3c4aff71688f35664ca9ebe15 Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Sun, 5 Apr 2026 09:33:13 +0000
Subject: [PATCH 1/4] refactor: rename dataset to suite across codebase (#943)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An eval file is a test suite (lifecycle hooks, workspace setup/teardown,
execution config), not a dataset (passive input/output pairs). This
renames the `dataset` field to `suite` everywhere:

- Core types: EvalTest.suite, EvaluationResult.suite
- Wire format: JSONL results write `suite` field
- CLI: --group-by suite, --suite flag, trace/trend/pipeline commands
- Studio UI: routes /suite/, labels "Suites", API endpoints /suites
- OTel: agentv.suite attribute
- Example baseline JSONL files updated
- Documentation and plugin skill files updated

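Breaking change — `dataset` is removed outright, with no backward-compat aliases: existing JSONL results, pipeline manifests, and Studio API clients must use `suite`.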

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/artifact-writer.ts | 12 +--
 apps/cli/src/commands/eval/discover.ts        |  4 +-
 apps/cli/src/commands/eval/junit-writer.ts    |  2 +-
 apps/cli/src/commands/eval/run-eval.ts        | 46 ++++-----
 apps/cli/src/commands/pipeline/bench.ts       |  8 +-
 apps/cli/src/commands/pipeline/grade.ts       |  8 +-
 apps/cli/src/commands/pipeline/input.ts       | 14 +--
 apps/cli/src/commands/pipeline/run.ts         | 16 +--
 apps/cli/src/commands/results/manifest.ts     |  6 +-
 apps/cli/src/commands/results/serve.ts        | 56 +++++------
 apps/cli/src/commands/trace/show.ts           |  2 +-
 apps/cli/src/commands/trace/stats.ts          |  4 +-
 apps/cli/src/commands/trace/utils.ts          |  6 +-
 apps/cli/src/commands/trend/index.ts          | 34 +++----
 .../commands/eval/artifact-writer.test.ts     |  8 +-
 .../test/commands/eval/output-writers.test.ts | 10 +-
 .../results/export-e2e-providers.test.ts      | 22 ++---
 apps/cli/test/commands/results/export.test.ts | 12 +--
 apps/cli/test/commands/results/serve.test.ts  |  4 +-
 apps/cli/test/commands/trace/trace.test.ts    |  6 +-
 apps/cli/test/commands/trend/trend.test.ts    | 98 +++++++++----------
 apps/studio/src/components/Breadcrumbs.tsx    |  4 +-
 apps/studio/src/components/RunDetail.tsx      | 50 +++++-----
 apps/studio/src/components/ScoreBar.tsx       |  2 +-
 apps/studio/src/components/Sidebar.tsx        | 36 ++++---
 apps/studio/src/lib/api.ts                    | 40 ++++----
 apps/studio/src/lib/types.ts                  | 10 +-
 apps/studio/src/routeTree.gen.ts              | 34 +++----
 .../runs/$runId_.category.$category.tsx       | 24 ++---
 ....$dataset.tsx => $runId_.suite.$suite.tsx} | 20 ++--
 .../docs/docs/evaluation/eval-files.mdx       |  4 +-
 .../web/src/content/docs/docs/tools/trace.mdx |  4 +-
 .../web/src/content/docs/docs/tools/trend.mdx | 16 +--
 .../assert/evals/dataset.eval.baseline.jsonl  |  8 +-
 .../evals/dataset.eval.baseline.jsonl         | 14 +--
 .../basic/evals/dataset.eval.baseline.jsonl   | 14 +--
 .../evals/dataset.eval.baseline.jsonl         |  8 +-
 .../evals/dataset.eval.baseline.jsonl         |  2 +-
 .../contextual-precision.eval.baseline.jsonl  |  6 +-
 .../contextual-recall.eval.baseline.jsonl     |  6 +-
 .../evals/dataset.eval.baseline.jsonl         |  8 +-
 .../evals/dataset.eval.baseline.jsonl         |  6 +-
 .../evals/dataset.eval.baseline.jsonl         | 14 +--
 .../confusion-metrics.eval.baseline.jsonl     | 10 +-
 .../evals/field-accuracy.eval.baseline.jsonl  | 12 +--
 .../evals/dataset.eval.baseline.jsonl         | 12 +--
 .../evals/dataset.eval.baseline.jsonl         | 10 +-
 .../evals/dataset.eval.baseline.jsonl         |  4 +-
 .../evals/dataset.eval.baseline.jsonl         |  2 +-
 .../evals/dataset.eval.baseline.jsonl         | 10 +-
 .../evals/dataset.eval.baseline.jsonl         |  2 +-
 .../evals/dataset.eval.baseline.jsonl         | 10 +-
 .../evals/dataset.eval.baseline.jsonl         |  4 +-
 .../evals/dataset.eval.baseline.jsonl         | 10 +-
 .../evals/dataset.eval.baseline.jsonl         |  4 +-
 .../rubric/evals/dataset.eval.baseline.jsonl  | 10 +-
 .../evals/dataset.eval.baseline.jsonl         |  4 +-
 .../evals/dataset.eval.baseline.jsonl         |  6 +-
 .../evals/dataset.eval.baseline.jsonl         |  6 +-
 .../evals/dataset.eval.baseline.jsonl         |  2 +-
 .../evals/dataset.eval.baseline.jsonl         |  6 +-
 .../evals/trace-file-demo.eval.baseline.jsonl | 12 +--
 .../evals/dataset.eval.baseline.jsonl         | 14 +--
 .../evals/multi-agent.eval.results.jsonl      | 10 +-
 .../evals/dataset.eval.baseline.jsonl         | 10 +-
 examples/features/trend/README.md             | 14 +--
 .../2026-03-01T10-00-00-000Z/index.jsonl      |  6 +-
 .../2026-03-08T10-00-00-000Z/index.jsonl      |  6 +-
 .../2026-03-15T10-00-00-000Z/index.jsonl      |  6 +-
 .../trials/evals/dataset.eval.baseline.jsonl  |  4 +-
 .../evals/dataset.eval.baseline.jsonl         |  6 +-
 .../evals/dataset.eval.baseline.jsonl         | 16 +--
 .../evals/dataset.eval.baseline.jsonl         | 44 ++++-----
 .../fixtures/setup-a.raw.jsonl                | 10 +-
 .../fixtures/setup-b.raw.jsonl                | 10 +-
 .../evals/encouragement.eval.baseline.jsonl   | 16 +--
 .../evals/listening.eval.baseline.jsonl       | 12 +--
 .../evals/routing.eval.baseline.jsonl         |  8 +-
 .../tool-eval-demo.baseline.jsonl             |  8 +-
 .../src/evaluation/loaders/jsonl-parser.ts    | 16 +--
 packages/core/src/evaluation/orchestrator.ts  |  8 +-
 packages/core/src/evaluation/types.ts         |  4 +-
 .../evaluation/validation/eval-file.schema.ts |  2 +-
 .../evaluation/validation/eval-validator.ts   |  2 +-
 packages/core/src/evaluation/yaml-parser.ts   | 12 +--
 .../core/src/observability/otel-exporter.ts   |  4 +-
 .../core/test/evaluation/baseline.test.ts     |  4 +-
 .../code-evaluator-file-backed.test.ts        |  2 +-
 .../code-evaluator-multimodal.test.ts         |  2 +-
 .../core/test/evaluation/evaluators.test.ts   |  2 +-
 .../evaluators/composite-threshold.test.ts    |  2 +-
 .../evaluators/execution-metrics.test.ts      |  2 +-
 .../evaluation/evaluators_variables.test.ts   |  2 +-
 .../test/evaluation/execution-metrics.test.ts |  2 +-
 .../test/evaluation/execution-status.test.ts  |  2 +-
 .../evaluation/llm-grader-multimodal.test.ts  |  2 +-
 .../evaluation/loaders/jsonl-parser.test.ts   | 10 +-
 .../core/test/evaluation/orchestrator.test.ts | 14 +--
 .../observability/streaming-observer.test.ts  |  6 +-
 .../skills/agentv-bench/agents/analyzer.md    |  2 +-
 .../skills/agentv-eval-writer/SKILL.md        |  2 +-
 .../references/eval-schema.json               |  4 +-
 .../skills/agentv-trace-analyst/SKILL.md      | 12 +--
 103 files changed, 572 insertions(+), 592 deletions(-)
 rename apps/studio/src/routes/runs/{$runId_.dataset.$dataset.tsx => $runId_.suite.$suite.tsx} (88%)

diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index 4c072d661..03c2e901f 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -94,7 +94,7 @@ export interface AggregateGradingArtifact {
 export interface IndexArtifactEntry {
   readonly timestamp: string;
   readonly test_id: string;
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly category?: string;
   readonly conversation_id?: string;
   readonly score: number;
@@ -459,13 +459,13 @@ function safeTestId(testId: string | undefined): string {
   return safeArtifactPathSegment(testId, 'unknown');
 }
 
-function getDataset(result: EvaluationResult): string | undefined {
-  return result.dataset;
+function getSuite(result: EvaluationResult): string | undefined {
+  return result.suite;
 }
 
 function buildArtifactSubdir(result: EvaluationResult): string {
   const segments = [];
-  const evalSet = getDataset(result);
+  const evalSet = getSuite(result);
   if (evalSet) {
     segments.push(safeArtifactPathSegment(evalSet, 'default'));
   }
@@ -504,7 +504,7 @@ export function buildIndexArtifactEntry(
   return {
     timestamp: result.timestamp,
     test_id: result.testId ?? 'unknown',
-    dataset: getDataset(result),
+    suite: getSuite(result),
     category: result.category,
     conversation_id: result.conversationId,
     score: result.score,
@@ -536,7 +536,7 @@ export function buildResultIndexArtifact(result: EvaluationResult): ResultIndexA
   return {
     timestamp: result.timestamp,
     test_id: result.testId ?? 'unknown',
-    dataset: getDataset(result),
+    suite: getSuite(result),
     category: result.category,
     conversation_id: result.conversationId,
     score: result.score,
diff --git a/apps/cli/src/commands/eval/discover.ts b/apps/cli/src/commands/eval/discover.ts
index f8ea59e19..cbb57c44b 100644
--- a/apps/cli/src/commands/eval/discover.ts
+++ b/apps/cli/src/commands/eval/discover.ts
@@ -17,8 +17,8 @@ export interface DiscoveredEvalFile {
  * Discover eval files by glob pattern matching.
  *
  * Uses `eval_patterns` from `.agentv/config.yaml` if configured,
- * otherwise falls back to default patterns that match `dataset*.yaml`
- * and `eval.yaml` files under `evals/` directories.
+ * otherwise falls back to default patterns that match `suite*.yaml`,
+ * `eval.yaml`, and legacy `dataset*.yaml` files under `evals/` directories.
  */
 export async function discoverEvalFiles(cwd: string): Promise<readonly DiscoveredEvalFile[]> {
   const repoRoot = await findRepoRoot(cwd);
diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts
index c53beca45..7e8e88d95 100644
--- a/apps/cli/src/commands/eval/junit-writer.ts
+++ b/apps/cli/src/commands/eval/junit-writer.ts
@@ -47,7 +47,7 @@ export class JunitWriter {
 
     const grouped = new Map<string, EvaluationResult[]>();
     for (const result of this.results) {
-      const suite = result.dataset ?? 'default';
+      const suite = result.suite ?? 'default';
       const existing = grouped.get(suite);
       if (existing) {
         existing.push(result);
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 53544e4e0..61aab86a6 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -480,7 +480,7 @@ async function prepareFileMetadata(params: {
   readonly testCases: readonly EvalTest[];
   readonly selections: readonly { selection: TargetSelection; inlineTargetLabel: string }[];
   readonly trialsConfig?: TrialsConfig;
-  readonly datasetTargets?: readonly string[];
+  readonly suiteTargets?: readonly string[];
   readonly yamlWorkers?: number;
   readonly yamlCache?: boolean;
   readonly yamlCachePath?: string;
@@ -501,23 +501,23 @@ async function prepareFileMetadata(params: {
   const relativePath = path.relative(cwd, testFilePath);
   const category = deriveCategory(relativePath);
 
-  const dataset = await loadTestSuite(testFilePath, repoRoot, {
+  const suite = await loadTestSuite(testFilePath, repoRoot, {
     verbose: options.verbose,
     filter: options.filter,
     category,
   });
-  const testIds = dataset.tests.map((value) => value.id);
+  const testIds = suite.tests.map((value) => value.id);
 
   // Determine target names: CLI --target flags override YAML
   const cliTargets = options.cliTargets;
-  const datasetTargets = dataset.targets;
+  const suiteTargets = suite.targets;
 
-  // Resolve which target names to use (precedence: CLI > dataset YAML targets > default)
+  // Resolve which target names to use (precedence: CLI > suite YAML targets > default)
   let targetNames: readonly string[];
   if (cliTargets.length > 0) {
     targetNames = cliTargets;
-  } else if (datasetTargets && datasetTargets.length > 0) {
-    targetNames = datasetTargets;
+  } else if (suiteTargets && suiteTargets.length > 0) {
+    targetNames = suiteTargets;
   } else {
     targetNames = [];
   }
@@ -568,17 +568,17 @@ async function prepareFileMetadata(params: {
 
   return {
     testIds,
-    testCases: dataset.tests,
+    testCases: suite.tests,
     selections,
-    trialsConfig: dataset.trials,
-    datasetTargets,
-    yamlWorkers: dataset.workers,
-    yamlCache: dataset.cacheConfig?.enabled,
-    yamlCachePath: dataset.cacheConfig?.cachePath,
-    totalBudgetUsd: dataset.totalBudgetUsd,
-    failOnError: dataset.failOnError,
-    threshold: dataset.threshold,
-    tags: dataset.metadata?.tags,
+    trialsConfig: suite.trials,
+    suiteTargets,
+    yamlWorkers: suite.workers,
+    yamlCache: suite.cacheConfig?.enabled,
+    yamlCachePath: suite.cacheConfig?.cachePath,
+    totalBudgetUsd: suite.totalBudgetUsd,
+    failOnError: suite.failOnError,
+    threshold: suite.threshold,
+    tags: suite.metadata?.tags,
   };
 }
 
@@ -1021,7 +1021,7 @@ export async function runEvalCommand(
         inlineTargetLabel: string;
       }[];
       readonly trialsConfig?: TrialsConfig;
-      readonly datasetTargets?: readonly string[];
+      readonly suiteTargets?: readonly string[];
       readonly yamlWorkers?: number;
       readonly yamlCache?: boolean;
       readonly yamlCachePath?: string;
@@ -1104,7 +1104,7 @@ export async function runEvalCommand(
     console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
   }
 
-  // Resolve dataset-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
+  // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
   const yamlThreshold = firstMeta?.threshold;
   const resolvedThreshold = options.threshold ?? yamlThreshold;
   if (resolvedThreshold !== undefined && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
@@ -1128,13 +1128,13 @@ export async function runEvalCommand(
   // In matrix mode, total eval count is tests × targets (accounting for per-test target overrides)
   let totalEvalCount = 0;
   for (const meta of fileMetadata.values()) {
-    const datasetTargetNames = meta.selections.map((s) => s.selection.targetName);
+    const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
     for (const test of meta.testCases) {
-      // Per-test targets override dataset-level targets.
+      // Per-test targets override suite-level targets.
       const testTargetNames =
         test.targets && test.targets.length > 0
-          ? test.targets.filter((t) => datasetTargetNames.includes(t))
-          : datasetTargetNames;
+          ? test.targets.filter((t) => suiteTargetNames.includes(t))
+          : suiteTargetNames;
       totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
     }
   }
diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
index ee355c5b2..cb4bcc4ad 100644
--- a/apps/cli/src/commands/pipeline/bench.ts
+++ b/apps/cli/src/commands/pipeline/bench.ts
@@ -37,15 +37,15 @@ export const evalBenchCommand = command({
     const manifest = JSON.parse(await readFile(join(exportDir, 'manifest.json'), 'utf8'));
     const testIds: string[] = manifest.test_ids;
     const targetName: string = manifest.target?.name ?? 'unknown';
-    const datasetName: string = manifest.dataset ?? '';
+    const suiteName: string = manifest.suite ?? '';
     const experiment: string | undefined = manifest.experiment;
-    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const indexLines: string[] = [];
     const allPassRates: number[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
+      const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
       const testDir = join(exportDir, ...subpath);
       const artifactSubdir = subpath.join('/');
       const evaluators: EvaluatorScore[] = [];
@@ -177,7 +177,7 @@ export const evalBenchCommand = command({
         JSON.stringify({
           timestamp: manifest.timestamp,
           test_id: testId,
-          dataset: datasetName || undefined,
+          suite: suiteName || undefined,
           experiment: experiment || undefined,
           score: Math.round(weightedScore * 1000) / 1000,
           target: targetName,
diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts
index 45faa8608..b9263c399 100644
--- a/apps/cli/src/commands/pipeline/grade.ts
+++ b/apps/cli/src/commands/pipeline/grade.ts
@@ -10,7 +10,7 @@
  * Progress is printed to stderr so users see real-time feedback.
  *
  * Export directory additions:
- *   <out-dir>/<dataset>/<test-id>/code_grader_results/<name>.json
+ *   <out-dir>/<suite>/<test-id>/code_grader_results/<name>.json
  */
 import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
@@ -196,14 +196,14 @@ export const evalGradeCommand = command({
     const manifestPath = join(exportDir, 'manifest.json');
     const manifest = JSON.parse(await readFile(manifestPath, 'utf8'));
     const testIds: string[] = manifest.test_ids;
-    const datasetName: string = manifest.dataset ?? '';
-    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const suiteName: string = manifest.suite ?? '';
+    const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     // Collect all grader tasks upfront so we know the total count
     const tasks: GraderTask[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
+      const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
       const testDir = join(exportDir, ...subpath);
       const codeGradersDir = join(testDir, 'code_graders');
       const resultsDir = join(testDir, 'code_grader_results');
diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
index 28b43b391..3eb7ad0a4 100644
--- a/apps/cli/src/commands/pipeline/input.ts
+++ b/apps/cli/src/commands/pipeline/input.ts
@@ -9,7 +9,7 @@
  * Export directory layout:
  *   <out-dir>/
  *   ├── manifest.json
- *   └── <dataset>/               (omitted if eval.yaml has no name)
+ *   └── <suite>/                (omitted if eval.yaml has no name)
  *       └── <test-id>/
  *           ├── input.json
  *           ├── invoke.json
@@ -58,8 +58,8 @@ export const evalInputCommand = command({
     const evalDir = dirname(resolvedEvalPath);
 
     const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
-    const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
-    const tests = dataset.tests;
+    const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
+    const tests = suite.tests;
 
     if (tests.length === 0) {
       console.error('No tests found in eval file.');
@@ -107,13 +107,13 @@ export const evalInputCommand = command({
       // No targets file found — subagent-as-target mode
     }
 
-    const datasetName = dataset.metadata?.name?.trim() ?? '';
-    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const suiteName = suite.metadata?.name?.trim() ?? '';
+    const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const testIds: string[] = [];
 
     for (const test of tests) {
-      const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id];
+      const subpath = safeSuiteName ? [safeSuiteName, test.id] : [test.id];
       const testDir = join(outDir, ...subpath);
       await mkdir(testDir, { recursive: true });
       testIds.push(test.id);
@@ -168,7 +168,7 @@ export const evalInputCommand = command({
     // manifest.json
     await writeJson(join(outDir, 'manifest.json'), {
       eval_file: resolvedEvalPath,
-      dataset: datasetName || undefined,
+      suite: suiteName || undefined,
       experiment: experiment || undefined,
       timestamp: new Date().toISOString(),
       target: {
diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
index 372bfd04f..f91db3dad 100644
--- a/apps/cli/src/commands/pipeline/run.ts
+++ b/apps/cli/src/commands/pipeline/run.ts
@@ -100,8 +100,8 @@ export const evalRunCommand = command({
 
     // ── Step 1: Extract inputs (same as pipeline input) ──────────────
     const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
-    const dataset = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
-    const tests = dataset.tests;
+    const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
+    const tests = suite.tests;
 
     if (tests.length === 0) {
       console.error('No tests found in eval file.');
@@ -145,13 +145,13 @@ export const evalRunCommand = command({
       // No targets file — subagent-as-target mode
     }
 
-    const datasetName = dataset.metadata?.name?.trim() ?? '';
-    const safeDatasetName = datasetName ? datasetName.replace(/[\/\\:*?"<>|]/g, '_') : '';
+    const suiteName = suite.metadata?.name?.trim() ?? '';
+    const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
     const testIds: string[] = [];
 
     for (const test of tests) {
-      const subpath = safeDatasetName ? [safeDatasetName, test.id] : [test.id];
+      const subpath = safeSuiteName ? [safeSuiteName, test.id] : [test.id];
       const testDir = join(outDir, ...subpath);
       await mkdir(testDir, { recursive: true });
       testIds.push(test.id);
@@ -198,7 +198,7 @@ export const evalRunCommand = command({
 
     await writeJson(join(outDir, 'manifest.json'), {
       eval_file: resolvedEvalPath,
-      dataset: datasetName || undefined,
+      suite: suiteName || undefined,
       experiment: experiment || undefined,
       timestamp: new Date().toISOString(),
       target: { name: targetName, kind: targetKind },
@@ -230,7 +230,7 @@ export const evalRunCommand = command({
       writeInvProgress();
 
       const invokeTarget = async (testId: string): Promise<void> => {
-        const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
+        const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
         const testDir = join(outDir, ...subpath);
         const invoke = JSON.parse(await readFile(join(testDir, 'invoke.json'), 'utf8'));
         if (invoke.kind !== 'cli') return;
@@ -341,7 +341,7 @@ export const evalRunCommand = command({
     const graderTasks: GraderTask[] = [];
 
     for (const testId of testIds) {
-      const subpath = safeDatasetName ? [safeDatasetName, testId] : [testId];
+      const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
       const testDir = join(outDir, ...subpath);
       const codeGradersDir = join(testDir, 'code_graders');
       const resultsDir = join(testDir, 'code_grader_results');
diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
index cffb4760a..98e8a5527 100644
--- a/apps/cli/src/commands/results/manifest.ts
+++ b/apps/cli/src/commands/results/manifest.ts
@@ -13,7 +13,7 @@ import {
 export interface ResultManifestRecord {
   readonly timestamp?: string;
   readonly test_id?: string;
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly category?: string;
   readonly experiment?: string;
   readonly target?: string;
@@ -123,7 +123,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
   return {
     timestamp: record.timestamp,
     testId,
-    dataset: record.dataset,
+    suite: record.suite,
     category: record.category,
     target: record.target,
     score: record.score,
@@ -189,6 +189,7 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] {
 
 export interface LightweightResultRecord {
   readonly testId: string;
+  readonly suite?: string;
   readonly target?: string;
   readonly experiment?: string;
   readonly score: number;
@@ -203,6 +204,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec
   const content = readFileSync(resolvedSourceFile, 'utf8');
   return parseResultManifest(content).map((record) => ({
     testId: record.test_id ?? 'unknown',
+    suite: record.suite,
     target: record.target,
     experiment: record.experiment,
     score: record.score,
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index e00e7e837..92f2b20d5 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -14,7 +14,7 @@
  *   - GET /api/projects  — list registered projects
  *   - GET /api/projects/:projectId/runs — project-scoped run list
  *
- * All data routes (runs, datasets, categories, evals, experiments, targets)
+ * All data routes (runs, suites, categories, evals, experiments, targets)
  * exist in both unscoped (/api/...) and project-scoped (/api/projects/:projectId/...)
  * variants. They share handler functions via DataContext, differing only in
  * how searchDir is resolved.
@@ -275,32 +275,32 @@ function handleRunDetail(c: C, { searchDir }: DataContext) {
   }
 }
 
-function handleRunDatasets(c: C, { searchDir, agentvDir }: DataContext) {
+function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) {
   const filename = c.req.param('filename');
   const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
   if (!meta) return c.json({ error: 'Run not found' }, 404);
   try {
     const loaded = loadManifestResults(meta.path);
     const { pass_threshold } = loadStudioConfig(agentvDir);
-    const datasetMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
+    const suiteMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
     for (const r of loaded) {
-      const ds = r.dataset ?? r.target ?? 'default';
-      const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
+      const ds = r.suite ?? r.target ?? 'default';
+      const entry = suiteMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
       entry.total++;
       if (r.score >= pass_threshold) entry.passed++;
       entry.scoreSum += r.score;
-      datasetMap.set(ds, entry);
+      suiteMap.set(ds, entry);
     }
-    const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
+    const suites = [...suiteMap.entries()].map(([name, entry]) => ({
       name,
       total: entry.total,
       passed: entry.passed,
       failed: entry.total - entry.passed,
       avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
     }));
-    return c.json({ datasets });
+    return c.json({ suites });
   } catch {
-    return c.json({ error: 'Failed to load datasets' }, 500);
+    return c.json({ error: 'Failed to load suites' }, 500);
   }
 }
 
@@ -313,7 +313,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) {
     const { pass_threshold } = loadStudioConfig(agentvDir);
     const categoryMap = new Map<
       string,
-      { total: number; passed: number; scoreSum: number; datasets: Set<string> }
+      { total: number; passed: number; scoreSum: number; suites: Set<string> }
     >();
     for (const r of loaded) {
       const cat = r.category ?? DEFAULT_CATEGORY;
@@ -321,12 +321,12 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) {
         total: 0,
         passed: 0,
         scoreSum: 0,
-        datasets: new Set<string>(),
+        suites: new Set<string>(),
       };
       entry.total++;
       if (r.score >= pass_threshold) entry.passed++;
       entry.scoreSum += r.score;
-      entry.datasets.add(r.dataset ?? r.target ?? 'default');
+      entry.suites.add(r.suite ?? r.target ?? 'default');
       categoryMap.set(cat, entry);
     }
     const categories = [...categoryMap.entries()].map(([name, entry]) => ({
@@ -335,7 +335,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) {
       passed: entry.passed,
       failed: entry.total - entry.passed,
       avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
-      dataset_count: entry.datasets.size,
+      suite_count: entry.suites.size,
     }));
     return c.json({ categories });
   } catch {
@@ -343,7 +343,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) {
   }
 }
 
-function handleCategoryDatasets(c: C, { searchDir, agentvDir }: DataContext) {
+function handleCategorySuites(c: C, { searchDir, agentvDir }: DataContext) {
   const filename = c.req.param('filename');
   const category = decodeURIComponent(c.req.param('category') ?? '');
   const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
@@ -352,25 +352,25 @@ function handleCategoryDatasets(c: C, { searchDir, agentvDir }: DataContext) {
     const loaded = loadManifestResults(meta.path);
     const { pass_threshold } = loadStudioConfig(agentvDir);
     const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
-    const datasetMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
+    const suiteMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
     for (const r of filtered) {
-      const ds = r.dataset ?? r.target ?? 'default';
-      const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
+      const ds = r.suite ?? r.target ?? 'default';
+      const entry = suiteMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
       entry.total++;
       if (r.score >= pass_threshold) entry.passed++;
       entry.scoreSum += r.score;
-      datasetMap.set(ds, entry);
+      suiteMap.set(ds, entry);
     }
-    const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
+    const suites = [...suiteMap.entries()].map(([name, entry]) => ({
       name,
       total: entry.total,
       passed: entry.passed,
       failed: entry.total - entry.passed,
       avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
     }));
-    return c.json({ datasets });
+    return c.json({ suites });
   } catch {
-    return c.json({ error: 'Failed to load datasets' }, 500);
+    return c.json({ error: 'Failed to load suites' }, 500);
   }
 }
 
@@ -780,10 +780,10 @@ export function createApp(
   app.get('/api/config', (c) => handleConfig(c, defaultCtx));
   app.get('/api/runs', (c) => handleRuns(c, defaultCtx));
   app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx));
-  app.get('/api/runs/:filename/datasets', (c) => handleRunDatasets(c, defaultCtx));
+  app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx));
   app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx));
-  app.get('/api/runs/:filename/categories/:category/datasets', (c) =>
-    handleCategoryDatasets(c, defaultCtx),
+  app.get('/api/runs/:filename/categories/:category/suites', (c) =>
+    handleCategorySuites(c, defaultCtx),
   );
   app.get('/api/runs/:filename/evals/:evalId', (c) => handleEvalDetail(c, defaultCtx));
   app.get('/api/runs/:filename/evals/:evalId/files', (c) => handleEvalFiles(c, defaultCtx));
@@ -872,14 +872,12 @@ export function createApp(
   app.get('/api/projects/:projectId/config', (c) => withProject(c, handleConfig));
   app.get('/api/projects/:projectId/runs', (c) => withProject(c, handleRuns));
   app.get('/api/projects/:projectId/runs/:filename', (c) => withProject(c, handleRunDetail));
-  app.get('/api/projects/:projectId/runs/:filename/datasets', (c) =>
-    withProject(c, handleRunDatasets),
-  );
+  app.get('/api/projects/:projectId/runs/:filename/suites', (c) => withProject(c, handleRunSuites));
   app.get('/api/projects/:projectId/runs/:filename/categories', (c) =>
     withProject(c, handleRunCategories),
   );
-  app.get('/api/projects/:projectId/runs/:filename/categories/:category/datasets', (c) =>
-    withProject(c, handleCategoryDatasets),
+  app.get('/api/projects/:projectId/runs/:filename/categories/:category/suites', (c) =>
+    withProject(c, handleCategorySuites),
   );
   app.get('/api/projects/:projectId/runs/:filename/evals/:evalId', (c) =>
     withProject(c, handleEvalDetail),
diff --git a/apps/cli/src/commands/trace/show.ts b/apps/cli/src/commands/trace/show.ts
index 598a4753f..50e12f7e7 100644
--- a/apps/cli/src/commands/trace/show.ts
+++ b/apps/cli/src/commands/trace/show.ts
@@ -225,7 +225,7 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st
   // Standard flat view
   const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red;
   lines.push(
-    `${c.bold}${testId}${c.reset}  ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? `  ${c.dim}target: ${result.target}${c.reset}` : ''}${result.dataset ? `  ${c.dim}dataset: ${result.dataset}${c.reset}` : ''}`,
+    `${c.bold}${testId}${c.reset}  ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? `  ${c.dim}target: ${result.target}${c.reset}` : ''}${result.suite ? `  ${c.dim}suite: ${result.suite}${c.reset}` : ''}`,
   );
 
   if (result.error) {
diff --git a/apps/cli/src/commands/trace/stats.ts b/apps/cli/src/commands/trace/stats.ts
index 6a88d10d0..cf3df312c 100644
--- a/apps/cli/src/commands/trace/stats.ts
+++ b/apps/cli/src/commands/trace/stats.ts
@@ -109,8 +109,8 @@ function groupResults(results: RawResult[], groupBy?: string): GroupedResults[]
       case 'target':
         key = result.target ?? 'unknown';
         break;
-      case 'dataset':
-        key = result.dataset ?? 'unknown';
+      case 'suite':
+        key = result.suite ?? 'unknown';
         break;
       case 'test-id':
         key = result.test_id ?? result.eval_id ?? 'unknown';
diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts
index 443a1466f..f10a97ab4 100644
--- a/apps/cli/src/commands/trace/utils.ts
+++ b/apps/cli/src/commands/trace/utils.ts
@@ -51,7 +51,7 @@ export interface RawResult {
   timestamp?: string;
   test_id?: string;
   eval_id?: string;
-  dataset?: string;
+  suite?: string;
   conversation_id?: string;
   score: number;
   assertions?: { text: string; passed: boolean; evidence?: string }[];
@@ -149,7 +149,7 @@ function toRawResult(result: EvaluationResult): RawResult {
   return {
     timestamp: result.timestamp,
     test_id: result.testId,
-    dataset: result.dataset,
+    suite: result.suite,
     conversation_id: result.conversationId,
     score: result.score,
     assertions: result.assertions?.map((assertion) => ({
@@ -334,7 +334,7 @@ function loadOtlpTraceFile(filePath: string): RawResult[] {
         stringAttr(rootAttrs.agentv_test_id) ??
         stringAttr(rootAttrs.agentv_eval_id) ??
         `trace-${index + 1}`,
-      dataset: stringAttr(rootAttrs.agentv_dataset),
+      suite: stringAttr(rootAttrs.agentv_suite),
       target: stringAttr(rootAttrs.agentv_target),
       score,
       error: root.status?.code === 2 ? root.status.message : undefined,
diff --git a/apps/cli/src/commands/trend/index.ts b/apps/cli/src/commands/trend/index.ts
index edd616d77..7ef5218ff 100644
--- a/apps/cli/src/commands/trend/index.ts
+++ b/apps/cli/src/commands/trend/index.ts
@@ -39,7 +39,7 @@ export interface TrendRunPoint {
 }
 
 export interface TrendFilters {
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly target?: string;
   readonly allowMissingTests: boolean;
 }
@@ -163,11 +163,11 @@ export function resolveTrendSources(
 function filterRunRecords(
   records: readonly LightweightResultRecord[],
   sourcePath: string,
-  dataset?: string,
+  suite?: string,
   target?: string,
 ): TrendRunRecord[] {
   return records
-    .filter((record) => (dataset ? record.dataset === dataset : true))
+    .filter((record) => (suite ? record.suite === suite : true))
     .filter((record) => (target ? record.target === target : true))
     .map((record) => ({ ...record, sourcePath }));
 }
@@ -268,28 +268,22 @@ export function determineTrendExitCode(
 
 export function analyzeTrend(params: {
   readonly sourcePaths: readonly string[];
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly target?: string;
   readonly slopeThreshold: number;
   readonly allowMissingTests: boolean;
   readonly failOnDegrading: boolean;
 }): TrendOutput {
-  const { sourcePaths, dataset, target, slopeThreshold, allowMissingTests, failOnDegrading } =
-    params;
+  const { sourcePaths, suite, target, slopeThreshold, allowMissingTests, failOnDegrading } = params;
 
   if (sourcePaths.length < 2) {
     throw new Error('Trend analysis requires at least 2 runs');
   }
 
   const filteredRuns = sourcePaths.map((sourcePath) => {
-    const records = filterRunRecords(
-      loadLightweightResults(sourcePath),
-      sourcePath,
-      dataset,
-      target,
-    );
+    const records = filterRunRecords(loadLightweightResults(sourcePath), sourcePath, suite, target);
     if (records.length === 0) {
-      const filters = [dataset ? `dataset=${dataset}` : '', target ? `target=${target}` : '']
+      const filters = [suite ? `suite=${suite}` : '', target ? `target=${target}` : '']
         .filter(Boolean)
         .join(', ');
       const suffix = filters ? ` after filtering by ${filters}` : '';
@@ -339,7 +333,7 @@ export function analyzeTrend(params: {
   return {
     runs,
     filters: {
-      dataset,
+      suite,
       target,
       allowMissingTests,
     },
@@ -377,7 +371,7 @@ export function formatTrendTable(output: TrendOutput): string {
     `${c.bold}Runs:${c.reset} ${output.summary.runCount} | ${c.bold}Range:${c.reset} ${output.summary.dateRange.start ?? 'unknown'} → ${output.summary.dateRange.end ?? 'unknown'}`,
   );
   lines.push(
-    `${c.bold}Filters:${c.reset} dataset=${output.filters.dataset ?? '*'} target=${output.filters.target ?? '*'} mode=${output.filters.allowMissingTests ? 'independent' : 'matched-tests'}`,
+    `${c.bold}Filters:${c.reset} suite=${output.filters.suite ?? '*'} target=${output.filters.target ?? '*'} mode=${output.filters.allowMissingTests ? 'independent' : 'matched-tests'}`,
   );
   lines.push(
     `${c.bold}Matched Tests:${c.reset} ${output.summary.matchedTestCount} | ${c.bold}Verdict:${c.reset} ${colorizeDirection(output.summary.direction)}`,
@@ -422,10 +416,10 @@ export const trendCommand = command({
       long: 'last',
       description: 'Use the most recent N runs from .agentv/results/runs/',
     }),
-    dataset: option({
+    suite: option({
       type: optional(string),
-      long: 'dataset',
-      description: 'Filter records to a dataset name',
+      long: 'suite',
+      description: 'Filter records to a suite name',
     }),
     target: option({
       type: optional(string),
@@ -459,7 +453,7 @@ export const trendCommand = command({
   handler: async ({
     runs,
     last,
-    dataset,
+    suite,
     target,
     slopeThreshold,
     failOnDegrading,
@@ -478,7 +472,7 @@ export const trendCommand = command({
       const sourcePaths = resolveTrendSources(process.cwd(), runs, last);
       const output = analyzeTrend({
         sourcePaths,
-        dataset,
+        suite,
         target,
         slopeThreshold: effectiveSlopeThreshold,
         allowMissingTests,
diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
index 16708a8a7..9826d2be1 100644
--- a/apps/cli/test/commands/eval/artifact-writer.test.ts
+++ b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -417,7 +417,7 @@ describe('buildIndexArtifactEntry', () => {
       makeResult({
         testId: 'alpha',
         target: 'claude',
-        dataset: 'demo',
+        suite: 'demo',
         scores: [makeEvaluatorResult({ name: 'quality', score: 0.7 })],
         executionStatus: 'quality_failure',
         error: 'model drift',
@@ -434,7 +434,7 @@ describe('buildIndexArtifactEntry', () => {
     expect(JSON.parse(JSON.stringify(entry))).toEqual({
       timestamp: '2026-03-13T00:00:00.000Z',
       test_id: 'alpha',
-      dataset: 'demo',
+      suite: 'demo',
       score: 0.9,
       target: 'claude',
       scores: [
@@ -699,9 +699,9 @@ describe('writeArtifactsFromResults', () => {
     expect(grading.assertions[0].text).toBe('baseline-check');
   });
 
-  it('prefixes artifact paths with dataset when present', async () => {
+  it('prefixes artifact paths with suite when present', async () => {
     const paths = await writeArtifactsFromResults(
-      [makeResult({ dataset: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })],
+      [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })],
       testDir,
     );
 
diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts
index feffdef4b..21d5f107d 100644
--- a/apps/cli/test/commands/eval/output-writers.test.ts
+++ b/apps/cli/test/commands/eval/output-writers.test.ts
@@ -123,12 +123,12 @@ describe('JunitWriter', () => {
     expect(xml).toContain('score=0.300');
   });
 
-  it('should group results by dataset as testsuites', async () => {
+  it('should group results by suite as testsuites', async () => {
     const writer = await JunitWriter.open(testFilePath);
 
-    await writer.append(makeResult({ testId: 'a-1', dataset: 'suite-a', score: 1.0 }));
-    await writer.append(makeResult({ testId: 'a-2', dataset: 'suite-a', score: 0.8 }));
-    await writer.append(makeResult({ testId: 'b-1', dataset: 'suite-b', score: 0.5 }));
+    await writer.append(makeResult({ testId: 'a-1', suite: 'suite-a', score: 1.0 }));
+    await writer.append(makeResult({ testId: 'a-2', suite: 'suite-a', score: 0.8 }));
+    await writer.append(makeResult({ testId: 'b-1', suite: 'suite-b', score: 0.5 }));
     await writer.close();
 
     const xml = await readFile(testFilePath, 'utf8');
@@ -136,7 +136,7 @@ describe('JunitWriter', () => {
     expect(xml).toContain('testsuite name="suite-b" tests="1"');
   });
 
-  it('should use default suite name when no dataset', async () => {
+  it('should use default suite name when no suite', async () => {
     const writer = await JunitWriter.open(testFilePath);
     await writer.append(makeResult({ testId: 'test-1', score: 1.0 }));
     await writer.close();
diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts
index 47bba1768..320fd524f 100644
--- a/apps/cli/test/commands/results/export-e2e-providers.test.ts
+++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts
@@ -23,7 +23,7 @@ import { exportResults } from '../../../src/commands/results/export.js';
 const CLAUDE_CLI_RESULT = {
   timestamp: '2026-03-18T10:00:00.000Z',
   test_id: 'test-claude-reasoning',
-  dataset: 'multi-provider',
+  suite: 'multi-provider',
   score: 1.0,
   assertions: [
     { text: 'Correct answer', passed: true, evidence: 'Matched expected output' },
@@ -60,7 +60,7 @@ const CLAUDE_CLI_RESULT = {
 const CODEX_RESULT = {
   timestamp: '2026-03-18T10:01:00.000Z',
   test_id: 'test-codex-edit',
-  dataset: 'multi-provider',
+  suite: 'multi-provider',
   score: 0.9,
   assertions: [
     { text: 'File edited correctly', passed: true },
@@ -96,7 +96,7 @@ const CODEX_RESULT = {
 const COPILOT_RESULT = {
   timestamp: '2026-03-18T10:02:00.000Z',
   test_id: 'test-copilot-complete',
-  dataset: 'multi-provider',
+  suite: 'multi-provider',
   score: 0.85,
   assertions: [
     { text: 'Code completion correct', passed: true },
@@ -125,7 +125,7 @@ const COPILOT_RESULT = {
 const PI_RESULT = {
   timestamp: '2026-03-18T10:03:00.000Z',
   test_id: 'test-pi-refactor',
-  dataset: 'multi-provider',
+  suite: 'multi-provider',
   score: 0.75,
   assertions: [
     { text: 'Refactored correctly', passed: true },
@@ -143,7 +143,7 @@ const PI_RESULT = {
 const LLM_AZURE_RESULT = {
   timestamp: '2026-03-18T10:04:00.000Z',
   test_id: 'test-llm-analysis',
-  dataset: 'multi-provider',
+  suite: 'multi-provider',
   score: 1.0,
   assertions: [{ text: 'Analysis correct', passed: true }],
   output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }],
@@ -166,7 +166,7 @@ const LLM_AZURE_RESULT = {
 const LLM_GPT_RESULT = {
   timestamp: '2026-03-18T10:05:00.000Z',
   test_id: 'test-llm-analysis',
-  dataset: 'multi-provider',
+  suite: 'multi-provider',
   score: 0.8,
   assertions: [{ text: 'Analysis correct', passed: true }],
   output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }],
@@ -181,7 +181,7 @@ const LLM_GPT_RESULT = {
 const MINIMAL_RESULT = {
   timestamp: '2026-03-18T10:06:00.000Z',
   test_id: 'test-minimal',
-  dataset: 'multi-provider',
+  suite: 'multi-provider',
   score: 0.5,
   assertions: [{ text: 'Exists', passed: true }],
   output: [{ role: 'assistant', content: 'Response.' }],
@@ -193,7 +193,7 @@ const MINIMAL_RESULT = {
 const ERROR_RESULT = {
   timestamp: '2026-03-18T10:07:00.000Z',
   test_id: 'test-error-case',
-  dataset: 'multi-provider',
+  suite: 'multi-provider',
   score: 0,
   assertions: [],
   output: [],
@@ -210,9 +210,9 @@ function toJsonl(...records: object[]): string {
   return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`;
 }
 
-function artifactDir(outputDir: string, record: { dataset?: string; test_id?: string }): string {
+function artifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string {
   const testId = record.test_id ?? 'unknown';
-  return path.join(outputDir, ...(record.dataset ? [record.dataset] : []), testId);
+  return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId);
 }
 
 describe('export e2e — multi-provider metrics verification', () => {
@@ -634,7 +634,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       const record = {
         timestamp: '2026-03-18T10:00:00.000Z',
         test_id: 'test-case-convert',
-        dataset: 'test',
+        suite: 'test',
         score: 1.0,
         assertions: [{ text: 'ok', passed: true }],
         output_text: 'ok',
diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts
index 8b123bc57..60d54661a 100644
--- a/apps/cli/test/commands/results/export.test.ts
+++ b/apps/cli/test/commands/results/export.test.ts
@@ -20,7 +20,7 @@ import {
 const RESULT_FULL = {
   timestamp: '2026-03-18T10:00:01.000Z',
   test_id: 'test-greeting',
-  dataset: 'demo',
+  suite: 'demo',
   score: 1.0,
   assertions: [
     { text: 'Says hello', passed: true },
@@ -44,7 +44,7 @@ const RESULT_FULL = {
 const RESULT_PARTIAL = {
   timestamp: '2026-03-18T10:00:05.000Z',
   test_id: 'test-math',
-  dataset: 'demo',
+  suite: 'demo',
   score: 0.5,
   assertions: [
     { text: 'Correct formula', passed: true },
@@ -70,7 +70,7 @@ const RESULT_PARTIAL = {
 const RESULT_DIFFERENT_TARGET = {
   timestamp: '2026-03-18T10:00:10.000Z',
   test_id: 'test-greeting',
-  dataset: 'demo',
+  suite: 'demo',
   score: 0.75,
   assertions: [
     { text: 'Says hello', passed: true },
@@ -85,7 +85,7 @@ const RESULT_DIFFERENT_TARGET = {
 const RESULT_NO_TRACE = {
   timestamp: '2026-03-18T10:00:15.000Z',
   test_id: 'test-simple',
-  dataset: 'demo',
+  suite: 'demo',
   score: 1.0,
   assertions: [{ text: 'Correct', passed: true }],
   output: [{ role: 'assistant', content: 'Yes.' }],
@@ -99,9 +99,9 @@ function toJsonl(...records: object[]): string {
   return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`;
 }
 
-function artifactDir(outputDir: string, record: { dataset?: string; test_id?: string }): string {
+function artifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string {
   const testId = record.test_id ?? 'unknown';
-  return path.join(outputDir, ...(record.dataset ? [record.dataset] : []), testId);
+  return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId);
 }
 
 describe('results export', () => {
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 2d7766622..343625fea 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -10,7 +10,7 @@ import { createApp, loadResults, resolveSourceFile } from '../../../src/commands
 const RESULT_A = {
   timestamp: '2026-03-18T10:00:01.000Z',
   test_id: 'test-greeting',
-  dataset: 'demo',
+  suite: 'demo',
   score: 1.0,
   assertions: [
     { text: 'Says hello', passed: true },
@@ -34,7 +34,7 @@ const RESULT_A = {
 const RESULT_B = {
   timestamp: '2026-03-18T10:00:05.000Z',
   test_id: 'test-math',
-  dataset: 'demo',
+  suite: 'demo',
   score: 0.5,
   assertions: [
     { text: 'Correct formula', passed: true },
diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts
index 32ea668cb..3f157b893 100644
--- a/apps/cli/test/commands/trace/trace.test.ts
+++ b/apps/cli/test/commands/trace/trace.test.ts
@@ -16,7 +16,7 @@ import {
 const RESULT_WITH_TRACE = JSON.stringify({
   timestamp: '2026-02-20T21:38:05.833Z',
   test_id: 'test-1',
-  dataset: 'demo',
+  suite: 'demo',
   score: 1,
   assertions: [{ text: 'criterion-1', passed: true }],
   target: 'default',
@@ -34,7 +34,7 @@ const RESULT_WITH_TRACE = JSON.stringify({
 const RESULT_WITHOUT_TRACE = JSON.stringify({
   timestamp: '2026-02-20T21:38:06.000Z',
   test_id: 'test-2',
-  dataset: 'demo',
+  suite: 'demo',
   score: 0.75,
   assertions: [
     { text: 'criterion-1', passed: true },
@@ -46,7 +46,7 @@ const RESULT_WITHOUT_TRACE = JSON.stringify({
 const RESULT_FAILING = JSON.stringify({
   timestamp: '2026-02-20T21:38:07.000Z',
   test_id: 'test-3',
-  dataset: 'demo',
+  suite: 'demo',
   score: 0,
   assertions: [
     { text: 'criterion-1', passed: false },
diff --git a/apps/cli/test/commands/trend/trend.test.ts b/apps/cli/test/commands/trend/trend.test.ts
index 2f32e184e..b29887919 100644
--- a/apps/cli/test/commands/trend/trend.test.ts
+++ b/apps/cli/test/commands/trend/trend.test.ts
@@ -21,7 +21,7 @@ const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts');
 interface RunRecordInput {
   readonly test_id: string;
   readonly score: number;
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly target?: string;
   readonly timestamp?: string;
 }
@@ -55,28 +55,28 @@ describe('trend command', () => {
     );
   });
 
-  it('computes a degrading trend over matched tests after dataset and target filtering', async () => {
+  it('computes a degrading trend over matched tests after suite and target filtering', async () => {
     const cwd = await createTempDir();
     cleanupDirs.push(cwd);
 
     const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'claude-sonnet',
         score: 0.95,
         timestamp: '2026-03-01T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'claude-sonnet',
         score: 0.85,
         timestamp: '2026-03-01T10:00:00.000Z',
       },
       {
         test_id: 't1',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'gpt-5',
         score: 0.7,
         timestamp: '2026-03-01T10:00:00.000Z',
@@ -85,21 +85,21 @@ describe('trend command', () => {
     const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'claude-sonnet',
         score: 0.85,
         timestamp: '2026-03-08T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'claude-sonnet',
         score: 0.75,
         timestamp: '2026-03-08T10:00:00.000Z',
       },
       {
         test_id: 't1',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'gpt-5',
         score: 0.8,
         timestamp: '2026-03-08T10:00:00.000Z',
@@ -108,21 +108,21 @@ describe('trend command', () => {
     const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'claude-sonnet',
         score: 0.75,
         timestamp: '2026-03-15T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'claude-sonnet',
         score: 0.65,
         timestamp: '2026-03-15T10:00:00.000Z',
       },
       {
         test_id: 't1',
-        dataset: 'code-review',
+        suite: 'code-review',
         target: 'gpt-5',
         score: 0.9,
         timestamp: '2026-03-15T10:00:00.000Z',
@@ -131,7 +131,7 @@ describe('trend command', () => {
 
     const output = analyzeTrend({
       sourcePaths: [run1.indexPath, run2.indexPath, run3.indexPath],
-      dataset: 'code-review',
+      suite: 'code-review',
       target: 'claude-sonnet',
       slopeThreshold: 0.01,
       allowMissingTests: false,
@@ -155,14 +155,14 @@ describe('trend command', () => {
     const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.8,
         timestamp: '2026-03-01T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.6,
         timestamp: '2026-03-01T10:00:00.000Z',
@@ -171,7 +171,7 @@ describe('trend command', () => {
     const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.9,
         timestamp: '2026-03-08T10:00:00.000Z',
@@ -180,7 +180,7 @@ describe('trend command', () => {
 
     const output = analyzeTrend({
       sourcePaths: [run1.indexPath, run2.indexPath],
-      dataset: 'suite',
+      suite: 'suite',
       target: 'alpha',
       slopeThreshold: 0.01,
       allowMissingTests: true,
@@ -199,7 +199,7 @@ describe('trend command', () => {
     const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.8,
         timestamp: '2026-03-01T10:00:00.000Z',
@@ -208,7 +208,7 @@ describe('trend command', () => {
     const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'beta',
         score: 0.7,
         timestamp: '2026-03-08T10:00:00.000Z',
@@ -218,7 +218,7 @@ describe('trend command', () => {
     expect(() =>
       analyzeTrend({
         sourcePaths: [run1.indexPath, run2.indexPath],
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         slopeThreshold: 0.01,
         allowMissingTests: false,
@@ -275,14 +275,14 @@ describe('trend command', () => {
     const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.9,
         timestamp: '2026-03-01T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.8,
         timestamp: '2026-03-01T10:00:00.000Z',
@@ -291,14 +291,14 @@ describe('trend command', () => {
     const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.8,
         timestamp: '2026-03-08T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.7,
         timestamp: '2026-03-08T10:00:00.000Z',
@@ -307,14 +307,14 @@ describe('trend command', () => {
     const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.7,
         timestamp: '2026-03-15T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.6,
         timestamp: '2026-03-15T10:00:00.000Z',
@@ -330,7 +330,7 @@ describe('trend command', () => {
         run1.runDir,
         run2.indexPath,
         run3.runDir,
-        '--dataset',
+        '--suite',
         'suite',
         '--target',
         'alpha',
@@ -342,7 +342,7 @@ describe('trend command', () => {
     expect(result.exitCode).toBe(0);
     const parsed = JSON.parse(result.stdout) as Record<string, unknown>;
     expect(parsed.filters).toEqual({
-      dataset: 'suite',
+      suite: 'suite',
       target: 'alpha',
       allow_missing_tests: false,
     });
@@ -357,14 +357,14 @@ describe('trend command', () => {
     const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.9,
         timestamp: '2026-03-01T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.8,
         timestamp: '2026-03-01T10:00:00.000Z',
@@ -373,14 +373,14 @@ describe('trend command', () => {
     const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.8,
         timestamp: '2026-03-08T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.7,
         timestamp: '2026-03-08T10:00:00.000Z',
@@ -389,14 +389,14 @@ describe('trend command', () => {
     const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.7,
         timestamp: '2026-03-15T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.6,
         timestamp: '2026-03-15T10:00:00.000Z',
@@ -405,7 +405,7 @@ describe('trend command', () => {
 
     const output = analyzeTrend({
       sourcePaths: [run3.runDir, run1.indexPath, run2.runDir],
-      dataset: 'suite',
+      suite: 'suite',
       target: 'alpha',
       slopeThreshold: 0.01,
       allowMissingTests: false,
@@ -431,14 +431,14 @@ describe('trend command', () => {
     await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.95,
         timestamp: '2026-03-01T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.85,
         timestamp: '2026-03-01T10:00:00.000Z',
@@ -447,14 +447,14 @@ describe('trend command', () => {
     await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.85,
         timestamp: '2026-03-08T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.75,
         timestamp: '2026-03-08T10:00:00.000Z',
@@ -463,14 +463,14 @@ describe('trend command', () => {
     await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.75,
         timestamp: '2026-03-15T10:00:00.000Z',
       },
       {
         test_id: 't2',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.65,
         timestamp: '2026-03-15T10:00:00.000Z',
@@ -485,7 +485,7 @@ describe('trend command', () => {
         'trend',
         '--last',
         '3',
-        '--dataset',
+        '--suite',
         'suite',
         '--target',
         'alpha',
@@ -508,7 +508,7 @@ describe('trend command', () => {
     await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'alpha',
         score: 0.8,
         timestamp: '2026-03-01T10:00:00.000Z',
@@ -517,7 +517,7 @@ describe('trend command', () => {
     await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [
       {
         test_id: 't1',
-        dataset: 'suite',
+        suite: 'suite',
         target: 'beta',
         score: 0.7,
         timestamp: '2026-03-08T10:00:00.000Z',
@@ -526,17 +526,7 @@ describe('trend command', () => {
 
     const result = await execa(
       'bun',
-      [
-        '--no-env-file',
-        CLI_ENTRY,
-        'trend',
-        '--last',
-        '2',
-        '--dataset',
-        'suite',
-        '--target',
-        'alpha',
-      ],
+      ['--no-env-file', CLI_ENTRY, 'trend', '--last', '2', '--suite', 'suite', '--target', 'alpha'],
       { cwd, reject: false },
     );
 
diff --git a/apps/studio/src/components/Breadcrumbs.tsx b/apps/studio/src/components/Breadcrumbs.tsx
index 602abc37e..9dedf70a5 100644
--- a/apps/studio/src/components/Breadcrumbs.tsx
+++ b/apps/studio/src/components/Breadcrumbs.tsx
@@ -34,9 +34,9 @@ function deriveSegments(matches: ReturnType<typeof useMatches>): BreadcrumbSegme
         label: params.category ?? 'Category',
         to: match.pathname,
       });
-    } else if (routeId.includes('/runs/$runId/dataset/$dataset')) {
+    } else if (routeId.includes('/runs/$runId/suite/$suite')) {
       segments.push({
-        label: params.dataset ?? 'Dataset',
+        label: params.suite ?? 'Suite',
         to: match.pathname,
       });
     } else if (routeId.includes('/runs/$runId')) {
diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx
index 4c2ea9b9f..6a9a25ee4 100644
--- a/apps/studio/src/components/RunDetail.tsx
+++ b/apps/studio/src/components/RunDetail.tsx
@@ -1,8 +1,8 @@
 /**
  * Run detail component showing per-eval breakdown with score bars.
  *
- * Groups results by category (from file path), then by dataset within each category.
- * Categories are shown as collapsible sections with dataset cards inside.
+ * Groups results by category (from file path), then by suite within each category.
+ * Categories are shown as collapsible sections with suite cards inside.
  */
 
 import { Link } from '@tanstack/react-router';
@@ -20,7 +20,7 @@ interface RunDetailProps {
   projectId?: string;
 }
 
-interface DatasetStats {
+interface SuiteStats {
   name: string;
   passed: number;
   failed: number;
@@ -30,7 +30,7 @@ interface DatasetStats {
 
 interface CategoryGroup {
   name: string;
-  datasets: DatasetStats[];
+  suites: SuiteStats[];
   total: number;
   passed: number;
   failed: number;
@@ -45,7 +45,7 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate
 
   for (const r of results) {
     const cat = r.category ?? 'Uncategorized';
-    const ds = r.dataset ?? 'Uncategorized';
+    const ds = r.suite ?? 'Uncategorized';
     if (!categoryMap.has(cat)) categoryMap.set(cat, new Map());
     // biome-ignore lint/style/noNonNullAssertion: map entry guaranteed by line above
     const dsMap = categoryMap.get(cat)!;
@@ -59,7 +59,7 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate
 
   return Array.from(categoryMap.entries())
     .map(([catName, dsMap]) => {
-      const datasets = Array.from(dsMap.entries())
+      const suites = Array.from(dsMap.entries())
         .map(([dsName, stats]) => ({
           name: dsName,
           ...stats,
@@ -67,14 +67,14 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate
         }))
         .sort((a, b) => a.name.localeCompare(b.name));
 
-      const total = datasets.reduce((s, d) => s + d.total, 0);
-      const passed = datasets.reduce((s, d) => s + d.passed, 0);
-      const failed = datasets.reduce((s, d) => s + d.failed, 0);
-      const scoreSum = datasets.reduce((s, d) => s + d.avgScore * d.total, 0);
+      const total = suites.reduce((s, d) => s + d.total, 0);
+      const passed = suites.reduce((s, d) => s + d.passed, 0);
+      const failed = suites.reduce((s, d) => s + d.failed, 0);
+      const scoreSum = suites.reduce((s, d) => s + d.avgScore * d.total, 0);
 
       return {
         name: catName,
-        datasets,
+        suites,
         total,
         passed,
         failed,
@@ -128,10 +128,10 @@ export function RunDetail({ results, runId, projectId }: RunDetailProps) {
         </div>
       ) : (
         <div className="space-y-3">
-          <h3 className="text-sm font-medium text-gray-400">Datasets</h3>
+          <h3 className="text-sm font-medium text-gray-400">Suites</h3>
           <div className="grid grid-cols-1 gap-2 sm:grid-cols-2 lg:grid-cols-3">
-            {categories[0]?.datasets.map((ds) => (
-              <DatasetCard key={ds.name} dataset={ds} runId={runId} />
+            {categories[0]?.suites.map((ds) => (
+              <SuiteCard key={ds.name} suite={ds} runId={runId} />
             ))}
           </div>
         </div>
@@ -210,7 +210,7 @@ function CategorySection({ category, runId }: { category: CategoryGroup; runId:
           <span className="text-xs text-gray-500">{expanded ? '\u25BC' : '\u25B6'}</span>
           <span className="text-sm font-medium text-gray-200">{category.name}</span>
           <span className="text-xs text-gray-500">
-            {category.datasets.length} dataset{category.datasets.length !== 1 ? 's' : ''}
+            {category.suites.length} suite{category.suites.length !== 1 ? 's' : ''}
           </span>
         </div>
         <div className="flex items-center gap-3 text-xs">
@@ -224,8 +224,8 @@ function CategorySection({ category, runId }: { category: CategoryGroup; runId:
       {expanded && (
         <div className="border-t border-gray-800 p-3">
           <div className="grid grid-cols-1 gap-2 sm:grid-cols-2 lg:grid-cols-3">
-            {category.datasets.map((ds) => (
-              <DatasetCard key={ds.name} dataset={ds} runId={runId} />
+            {category.suites.map((ds) => (
+              <SuiteCard key={ds.name} suite={ds} runId={runId} />
             ))}
           </div>
         </div>
@@ -234,25 +234,25 @@ function CategorySection({ category, runId }: { category: CategoryGroup; runId:
   );
 }
 
-function DatasetCard({ dataset, runId }: { dataset: DatasetStats; runId: string }) {
+function SuiteCard({ suite, runId }: { suite: SuiteStats; runId: string }) {
   return (
     <Link
-      to="/runs/$runId/dataset/$dataset"
-      params={{ runId, dataset: dataset.name }}
+      to="/runs/$runId/suite/$suite"
+      params={{ runId, suite: suite.name }}
       className="rounded-lg border border-gray-800 bg-gray-900 p-3 text-left transition-colors hover:border-gray-700"
     >
       <div className="flex items-center justify-between">
-        <span className="text-sm font-medium text-gray-200 truncate">{dataset.name}</span>
+        <span className="text-sm font-medium text-gray-200 truncate">{suite.name}</span>
         <span className="ml-2 text-xs text-gray-500">
-          {dataset.passed}/{dataset.total}
+          {suite.passed}/{suite.total}
         </span>
       </div>
       <div className="mt-2">
-        <ScoreBar score={dataset.avgScore} />
+        <ScoreBar score={suite.avgScore} />
       </div>
       <div className="mt-1 flex gap-3 text-xs">
-        <span className="text-emerald-400">{dataset.passed} passed</span>
-        {dataset.failed > 0 && <span className="text-red-400">{dataset.failed} failed</span>}
+        <span className="text-emerald-400">{suite.passed} passed</span>
+        {suite.failed > 0 && <span className="text-red-400">{suite.failed} failed</span>}
       </div>
     </Link>
   );
diff --git a/apps/studio/src/components/ScoreBar.tsx b/apps/studio/src/components/ScoreBar.tsx
index 368909d09..2c2358c5c 100644
--- a/apps/studio/src/components/ScoreBar.tsx
+++ b/apps/studio/src/components/ScoreBar.tsx
@@ -2,7 +2,7 @@
  * Gradient score bar component.
  *
  * Renders a horizontal bar from cyan-400 to blue-500, proportional to the
- * score value (0..1). Used in run lists, dataset breakdowns, and eval detail.
+ * score value (0..1). Used in run lists, suite breakdowns, and eval detail.
  */
 
 interface ScoreBarProps {
diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx
index eed55444a..fa9f56d8b 100644
--- a/apps/studio/src/components/Sidebar.tsx
+++ b/apps/studio/src/components/Sidebar.tsx
@@ -4,7 +4,7 @@
  * Adapts its content based on the current route:
  * - At root or run detail: shows list of runs
  * - At eval detail: shows list of evals in the current run with pass/fail indicators
- * - At dataset detail: shows evals filtered to that dataset
+ * - At suite detail: shows evals filtered to that suite
  * - At experiment detail: shows list of experiments
  */
 
@@ -13,7 +13,7 @@ import { Link, useMatchRoute } from '@tanstack/react-router';
 import {
   isPassing,
   useAllProjectRuns,
-  useCategoryDatasets,
+  useCategorySuites,
   useExperiments,
   useProjectList,
   useProjectRunDetail,
@@ -68,8 +68,8 @@ export function Sidebar() {
     to: '/runs/$runId/category/$category',
     fuzzy: true,
   });
-  const datasetMatch = matchRoute({
-    to: '/runs/$runId/dataset/$dataset',
+  const suiteMatch = matchRoute({
+    to: '/runs/$runId/suite/$suite',
     fuzzy: true,
   });
   const experimentMatch = matchRoute({
@@ -82,9 +82,9 @@ export function Sidebar() {
     return <CategorySidebar runId={runId} category={category} />;
   }
 
-  if (datasetMatch && typeof datasetMatch === 'object' && 'runId' in datasetMatch) {
-    const { runId, dataset } = datasetMatch as { runId: string; dataset: string };
-    return <DatasetSidebar runId={runId} dataset={dataset} />;
+  if (suiteMatch && typeof suiteMatch === 'object' && 'runId' in suiteMatch) {
+    const { runId, suite } = suiteMatch as { runId: string; suite: string };
+    return <SuiteSidebar runId={runId} suite={suite} />;
   }
 
   if (evalMatch && typeof evalMatch === 'object' && 'runId' in evalMatch) {
@@ -242,13 +242,11 @@ function EvalSidebar({ runId, currentEvalId }: { runId: string; currentEvalId: s
   );
 }
 
-function DatasetSidebar({ runId, dataset }: { runId: string; dataset: string }) {
+function SuiteSidebar({ runId, suite }: { runId: string; suite: string }) {
   const { data } = useRunDetail(runId);
   const { data: config } = useStudioConfig();
   const passThreshold = config?.pass_threshold ?? 0.8;
-  const datasetResults = (data?.results ?? []).filter(
-    (r) => (r.dataset ?? 'Uncategorized') === dataset,
-  );
+  const suiteResults = (data?.results ?? []).filter((r) => (r.suite ?? 'Uncategorized') === suite);
 
   return (
     <aside className="flex w-64 flex-col border-r border-gray-800 bg-gray-900/50">
@@ -268,7 +266,7 @@ function DatasetSidebar({ runId, dataset }: { runId: string; dataset: string })
           &larr; Back to run
         </Link>
         <p className="mt-1 truncate text-sm font-medium text-gray-300">{runId}</p>
-        <p className="truncate text-xs text-gray-500">{dataset}</p>
+        <p className="truncate text-xs text-gray-500">{suite}</p>
       </div>
 
       <nav className="flex-1 overflow-y-auto px-2 py-3">
@@ -276,7 +274,7 @@ function DatasetSidebar({ runId, dataset }: { runId: string; dataset: string })
           Evaluations
         </div>
 
-        {datasetResults.map((result) => {
+        {suiteResults.map((result) => {
           const passed = isPassing(result.score, passThreshold);
 
           return (
@@ -299,8 +297,8 @@ function DatasetSidebar({ runId, dataset }: { runId: string; dataset: string })
 }
 
 function CategorySidebar({ runId, category }: { runId: string; category: string }) {
-  const { data } = useCategoryDatasets(runId, category);
-  const datasets = data?.datasets ?? [];
+  const { data } = useCategorySuites(runId, category);
+  const suites = data?.suites ?? [];
 
   return (
     <aside className="flex w-64 flex-col border-r border-gray-800 bg-gray-900/50">
@@ -324,14 +322,14 @@ function CategorySidebar({ runId, category }: { runId: string; category: string
 
       <nav className="flex-1 overflow-y-auto px-2 py-3">
         <div className="mb-2 px-2 text-xs font-medium uppercase tracking-wider text-gray-500">
-          Datasets
+          Suites
         </div>
 
-        {datasets.map((ds) => (
+        {suites.map((ds) => (
           <Link
             key={ds.name}
-            to="/runs/$runId/dataset/$dataset"
-            params={{ runId, dataset: ds.name }}
+            to="/runs/$runId/suite/$suite"
+            params={{ runId, suite: ds.name }}
             className="mb-0.5 flex items-center gap-2 rounded-md px-2 py-1.5 text-sm text-gray-400 transition-colors hover:bg-gray-800/50 hover:text-gray-200"
           >
             <span
diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts
index b47ed9a59..c2c4de029 100644
--- a/apps/studio/src/lib/api.ts
+++ b/apps/studio/src/lib/api.ts
@@ -9,7 +9,6 @@ import { queryOptions, useQuery } from '@tanstack/react-query';
 
 import type {
   CategoriesResponse,
-  DatasetsResponse,
   EvalDetailResponse,
   ExperimentsResponse,
   FeedbackData,
@@ -21,6 +20,7 @@ import type {
   RunDetailResponse,
   RunListResponse,
   StudioConfigResponse,
+  SuitesResponse,
   TargetsResponse,
 } from './types';
 
@@ -48,10 +48,10 @@ export function runDetailOptions(filename: string) {
   });
 }
 
-export function runDatasetsOptions(runId: string) {
+export function runSuitesOptions(runId: string) {
   return queryOptions({
-    queryKey: ['runs', runId, 'datasets'],
-    queryFn: () => fetchJson<DatasetsResponse>(`/api/runs/${encodeURIComponent(runId)}/datasets`),
+    queryKey: ['runs', runId, 'suites'],
+    queryFn: () => fetchJson<SuitesResponse>(`/api/runs/${encodeURIComponent(runId)}/suites`),
     enabled: !!runId,
   });
 }
@@ -118,12 +118,12 @@ export function runCategoriesOptions(runId: string) {
   });
 }
 
-export function categoryDatasetsOptions(runId: string, category: string) {
+export function categorySuitesOptions(runId: string, category: string) {
   return queryOptions({
-    queryKey: ['runs', runId, 'categories', category, 'datasets'],
+    queryKey: ['runs', runId, 'categories', category, 'suites'],
     queryFn: () =>
-      fetchJson<DatasetsResponse>(
-        `/api/runs/${encodeURIComponent(runId)}/categories/${encodeURIComponent(category)}/datasets`,
+      fetchJson<SuitesResponse>(
+        `/api/runs/${encodeURIComponent(runId)}/categories/${encodeURIComponent(category)}/suites`,
       ),
     enabled: !!runId && !!category,
   });
@@ -145,8 +145,8 @@ export function useRunDetail(filename: string) {
   return useQuery(runDetailOptions(filename));
 }
 
-export function useRunDatasets(runId: string) {
-  return useQuery(runDatasetsOptions(runId));
+export function useRunSuites(runId: string) {
+  return useQuery(runSuitesOptions(runId));
 }
 
 export function useEvalDetail(runId: string, evalId: string) {
@@ -181,8 +181,8 @@ export function useRunCategories(runId: string) {
   return useQuery(runCategoriesOptions(runId));
 }
 
-export function useCategoryDatasets(runId: string, category: string) {
-  return useQuery(categoryDatasetsOptions(runId, category));
+export function useCategorySuites(runId: string, category: string) {
+  return useQuery(categorySuitesOptions(runId, category));
 }
 
 export function useStudioConfig() {
@@ -287,12 +287,12 @@ export function useProjectRunDetail(projectId: string, filename: string) {
   return useQuery(projectRunDetailOptions(projectId, filename));
 }
 
-export function projectRunDatasetsOptions(projectId: string, runId: string) {
+export function projectRunSuitesOptions(projectId: string, runId: string) {
   return queryOptions({
-    queryKey: ['projects', projectId, 'runs', runId, 'datasets'],
+    queryKey: ['projects', projectId, 'runs', runId, 'suites'],
     queryFn: () =>
-      fetchJson<DatasetsResponse>(
-        `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/datasets`,
+      fetchJson<SuitesResponse>(
+        `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/suites`,
       ),
     enabled: !!projectId && !!runId,
   });
@@ -309,12 +309,12 @@ export function projectRunCategoriesOptions(projectId: string, runId: string) {
   });
 }
 
-export function projectCategoryDatasetsOptions(projectId: string, runId: string, category: string) {
+export function projectCategorySuitesOptions(projectId: string, runId: string, category: string) {
   return queryOptions({
-    queryKey: ['projects', projectId, 'runs', runId, 'categories', category, 'datasets'],
+    queryKey: ['projects', projectId, 'runs', runId, 'categories', category, 'suites'],
     queryFn: () =>
-      fetchJson<DatasetsResponse>(
-        `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/categories/${encodeURIComponent(category)}/datasets`,
+      fetchJson<SuitesResponse>(
+        `${projectApiBase(projectId)}/runs/${encodeURIComponent(runId)}/categories/${encodeURIComponent(category)}/suites`,
       ),
     enabled: !!projectId && !!runId && !!category,
   });
diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts
index d40455528..d924bebfd 100644
--- a/apps/studio/src/lib/types.ts
+++ b/apps/studio/src/lib/types.ts
@@ -50,7 +50,7 @@ export interface AssertionEntry {
 export interface EvalResult {
   testId: string;
   timestamp?: string;
-  dataset?: string;
+  suite?: string;
   category?: string;
   target?: string;
   experiment?: string;
@@ -73,7 +73,7 @@ export interface RunDetailResponse {
   source: string;
 }
 
-export interface DatasetSummary {
+export interface SuiteSummary {
   name: string;
   total: number;
   passed: number;
@@ -81,8 +81,8 @@ export interface DatasetSummary {
   avg_score: number;
 }
 
-export interface DatasetsResponse {
-  datasets: DatasetSummary[];
+export interface SuitesResponse {
+  suites: SuiteSummary[];
 }
 
 export interface EvalDetailResponse {
@@ -160,7 +160,7 @@ export interface CategorySummary {
   passed: number;
   failed: number;
   avg_score: number;
-  dataset_count: number;
+  suite_count: number;
 }
 
 export interface CategoriesResponse {
diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts
index 27950d45f..3ae0016b9 100644
--- a/apps/studio/src/routeTree.gen.ts
+++ b/apps/studio/src/routeTree.gen.ts
@@ -15,7 +15,7 @@ import { Route as RunsRunIdRouteImport } from './routes/runs/$runId'
 import { Route as ProjectsProjectIdRouteImport } from './routes/projects/$projectId'
 import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName'
 import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$evalId'
-import { Route as RunsRunIdDatasetDatasetRouteImport } from './routes/runs/$runId_.dataset.$dataset'
+import { Route as RunsRunIdSuiteSuiteRouteImport } from './routes/runs/$runId_.suite.$suite'
 import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category'
 import { Route as ProjectsProjectIdRunsRunIdRouteImport } from './routes/projects/$projectId_/runs/$runId'
 import { Route as ProjectsProjectIdEvalsRunIdEvalIdRouteImport } from './routes/projects/$projectId_/evals/$runId.$evalId'
@@ -51,9 +51,9 @@ const EvalsRunIdEvalIdRoute = EvalsRunIdEvalIdRouteImport.update({
   path: '/evals/$runId/$evalId',
   getParentRoute: () => rootRouteImport,
 } as any)
-const RunsRunIdDatasetDatasetRoute = RunsRunIdDatasetDatasetRouteImport.update({
-  id: '/runs/$runId_/dataset/$dataset',
-  path: '/runs/$runId/dataset/$dataset',
+const RunsRunIdSuiteSuiteRoute = RunsRunIdSuiteSuiteRouteImport.update({
+  id: '/runs/$runId_/suite/$suite',
+  path: '/runs/$runId/suite/$suite',
   getParentRoute: () => rootRouteImport,
 } as any)
 const RunsRunIdCategoryCategoryRoute =
@@ -84,7 +84,7 @@ export interface FileRoutesByFullPath {
   '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute
   '/projects/$projectId/runs/$runId': typeof ProjectsProjectIdRunsRunIdRoute
   '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute
-  '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute
+  '/runs/$runId/suite/$suite': typeof RunsRunIdSuiteSuiteRoute
   '/projects/$projectId/evals/$runId/$evalId': typeof ProjectsProjectIdEvalsRunIdEvalIdRoute
 }
 export interface FileRoutesByTo {
@@ -96,7 +96,7 @@ export interface FileRoutesByTo {
   '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute
   '/projects/$projectId/runs/$runId': typeof ProjectsProjectIdRunsRunIdRoute
   '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute
-  '/runs/$runId/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute
+  '/runs/$runId/suite/$suite': typeof RunsRunIdSuiteSuiteRoute
   '/projects/$projectId/evals/$runId/$evalId': typeof ProjectsProjectIdEvalsRunIdEvalIdRoute
 }
 export interface FileRoutesById {
@@ -109,7 +109,7 @@ export interface FileRoutesById {
   '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute
   '/projects/$projectId_/runs/$runId': typeof ProjectsProjectIdRunsRunIdRoute
   '/runs/$runId_/category/$category': typeof RunsRunIdCategoryCategoryRoute
-  '/runs/$runId_/dataset/$dataset': typeof RunsRunIdDatasetDatasetRoute
+  '/runs/$runId_/suite/$suite': typeof RunsRunIdSuiteSuiteRoute
   '/projects/$projectId_/evals/$runId/$evalId': typeof ProjectsProjectIdEvalsRunIdEvalIdRoute
 }
 export interface FileRouteTypes {
@@ -123,7 +123,7 @@ export interface FileRouteTypes {
     | '/evals/$runId/$evalId'
     | '/projects/$projectId/runs/$runId'
     | '/runs/$runId/category/$category'
-    | '/runs/$runId/dataset/$dataset'
+    | '/runs/$runId/suite/$suite'
     | '/projects/$projectId/evals/$runId/$evalId'
   fileRoutesByTo: FileRoutesByTo
   to:
@@ -135,7 +135,7 @@ export interface FileRouteTypes {
     | '/evals/$runId/$evalId'
     | '/projects/$projectId/runs/$runId'
     | '/runs/$runId/category/$category'
-    | '/runs/$runId/dataset/$dataset'
+    | '/runs/$runId/suite/$suite'
     | '/projects/$projectId/evals/$runId/$evalId'
   id:
     | '__root__'
@@ -147,7 +147,7 @@ export interface FileRouteTypes {
     | '/evals/$runId/$evalId'
     | '/projects/$projectId_/runs/$runId'
     | '/runs/$runId_/category/$category'
-    | '/runs/$runId_/dataset/$dataset'
+    | '/runs/$runId_/suite/$suite'
     | '/projects/$projectId_/evals/$runId/$evalId'
   fileRoutesById: FileRoutesById
 }
@@ -160,7 +160,7 @@ export interface RootRouteChildren {
   EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute
   ProjectsProjectIdRunsRunIdRoute: typeof ProjectsProjectIdRunsRunIdRoute
   RunsRunIdCategoryCategoryRoute: typeof RunsRunIdCategoryCategoryRoute
-  RunsRunIdDatasetDatasetRoute: typeof RunsRunIdDatasetDatasetRoute
+  RunsRunIdSuiteSuiteRoute: typeof RunsRunIdSuiteSuiteRoute
   ProjectsProjectIdEvalsRunIdEvalIdRoute: typeof ProjectsProjectIdEvalsRunIdEvalIdRoute
 }
 
@@ -208,11 +208,11 @@ declare module '@tanstack/react-router' {
       preLoaderRoute: typeof EvalsRunIdEvalIdRouteImport
       parentRoute: typeof rootRouteImport
     }
-    '/runs/$runId_/dataset/$dataset': {
-      id: '/runs/$runId_/dataset/$dataset'
-      path: '/runs/$runId/dataset/$dataset'
-      fullPath: '/runs/$runId/dataset/$dataset'
-      preLoaderRoute: typeof RunsRunIdDatasetDatasetRouteImport
+    '/runs/$runId_/suite/$suite': {
+      id: '/runs/$runId_/suite/$suite'
+      path: '/runs/$runId/suite/$suite'
+      fullPath: '/runs/$runId/suite/$suite'
+      preLoaderRoute: typeof RunsRunIdSuiteSuiteRouteImport
       parentRoute: typeof rootRouteImport
     }
     '/runs/$runId_/category/$category': {
@@ -248,7 +248,7 @@ const rootRouteChildren: RootRouteChildren = {
   EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute,
   ProjectsProjectIdRunsRunIdRoute: ProjectsProjectIdRunsRunIdRoute,
   RunsRunIdCategoryCategoryRoute: RunsRunIdCategoryCategoryRoute,
-  RunsRunIdDatasetDatasetRoute: RunsRunIdDatasetDatasetRoute,
+  RunsRunIdSuiteSuiteRoute: RunsRunIdSuiteSuiteRoute,
   ProjectsProjectIdEvalsRunIdEvalIdRoute:
     ProjectsProjectIdEvalsRunIdEvalIdRoute,
 }
diff --git a/apps/studio/src/routes/runs/$runId_.category.$category.tsx b/apps/studio/src/routes/runs/$runId_.category.$category.tsx
index 813b5b6c6..5b7ed9532 100644
--- a/apps/studio/src/routes/runs/$runId_.category.$category.tsx
+++ b/apps/studio/src/routes/runs/$runId_.category.$category.tsx
@@ -1,5 +1,5 @@
 /**
- * Category drill-down route: shows datasets filtered to a single category.
+ * Category drill-down route: shows suites filtered to a single category.
  *
  * Uses the `$runId_` trailing-underscore convention so that
  * `/runs/:runId/category/:category` is a sibling of `/runs/:runId`,
@@ -10,7 +10,7 @@ import { Link, createFileRoute } from '@tanstack/react-router';
 
 import { ScoreBar } from '~/components/ScoreBar';
 import { StatsCards } from '~/components/StatsCards';
-import { useCategoryDatasets } from '~/lib/api';
+import { useCategorySuites } from '~/lib/api';
 
 export const Route = createFileRoute('/runs/$runId_/category/$category')({
   component: CategoryPage,
@@ -18,7 +18,7 @@ export const Route = createFileRoute('/runs/$runId_/category/$category')({
 
 function CategoryPage() {
   const { runId, category } = Route.useParams();
-  const { data, isLoading, error } = useCategoryDatasets(runId, category);
+  const { data, isLoading, error } = useCategorySuites(runId, category);
 
   if (isLoading) {
     return (
@@ -41,9 +41,9 @@ function CategoryPage() {
     );
   }
 
-  const datasets = data?.datasets ?? [];
-  const total = datasets.reduce((s, d) => s + d.total, 0);
-  const passed = datasets.reduce((s, d) => s + d.passed, 0);
+  const suites = data?.suites ?? [];
+  const total = suites.reduce((s, d) => s + d.total, 0);
+  const passed = suites.reduce((s, d) => s + d.passed, 0);
   const failed = total - passed;
   const passRate = total > 0 ? passed / total : 0;
 
@@ -56,19 +56,19 @@ function CategoryPage() {
 
       <StatsCards total={total} passed={passed} failed={failed} passRate={passRate} />
 
-      {datasets.length === 0 ? (
+      {suites.length === 0 ? (
         <div className="rounded-lg border border-gray-800 bg-gray-900 p-8 text-center">
-          <p className="text-lg text-gray-400">No datasets in this category</p>
+          <p className="text-lg text-gray-400">No suites in this category</p>
         </div>
       ) : (
         <div className="space-y-3">
-          <h3 className="text-sm font-medium text-gray-400">Datasets</h3>
+          <h3 className="text-sm font-medium text-gray-400">Suites</h3>
           <div className="grid grid-cols-1 gap-2 sm:grid-cols-2 lg:grid-cols-3">
-            {datasets.map((ds) => (
+            {suites.map((ds) => (
               <Link
                 key={ds.name}
-                to="/runs/$runId/dataset/$dataset"
-                params={{ runId, dataset: ds.name }}
+                to="/runs/$runId/suite/$suite"
+                params={{ runId, suite: ds.name }}
                 className="rounded-lg border border-gray-800 bg-gray-900 p-3 text-left transition-colors hover:border-gray-700"
               >
                 <div className="flex items-center justify-between">
diff --git a/apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx b/apps/studio/src/routes/runs/$runId_.suite.$suite.tsx
similarity index 88%
rename from apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx
rename to apps/studio/src/routes/runs/$runId_.suite.$suite.tsx
index 0b9722e07..e5a7e811d 100644
--- a/apps/studio/src/routes/runs/$runId_.dataset.$dataset.tsx
+++ b/apps/studio/src/routes/runs/$runId_.suite.$suite.tsx
@@ -1,8 +1,8 @@
 /**
- * Dataset drill-down route: shows evals filtered to a single dataset.
+ * Suite drill-down route: shows evals filtered to a single suite.
  *
  * Uses the `$runId_` trailing-underscore convention so that
- * `/runs/:runId/dataset/:dataset` is a sibling of `/runs/:runId`,
+ * `/runs/:runId/suite/:suite` is a sibling of `/runs/:runId`,
  * not a child route.
  */
 
@@ -12,12 +12,12 @@ import { ScoreBar } from '~/components/ScoreBar';
 import { StatsCards } from '~/components/StatsCards';
 import { isPassing, useRunDetail, useStudioConfig } from '~/lib/api';
 
-export const Route = createFileRoute('/runs/$runId_/dataset/$dataset')({
-  component: DatasetPage,
+export const Route = createFileRoute('/runs/$runId_/suite/$suite')({
+  component: SuitePage,
 });
 
-function DatasetPage() {
-  const { runId, dataset } = Route.useParams();
+function SuitePage() {
+  const { runId, suite } = Route.useParams();
   const { data, isLoading, error } = useRunDetail(runId);
   const { data: config } = useStudioConfig();
   const passThreshold = config?.pass_threshold ?? 0.8;
@@ -43,7 +43,7 @@ function DatasetPage() {
     );
   }
 
-  const results = (data?.results ?? []).filter((r) => (r.dataset ?? 'Uncategorized') === dataset);
+  const results = (data?.results ?? []).filter((r) => (r.suite ?? 'Uncategorized') === suite);
   const total = results.length;
   const passed = results.filter((r) => isPassing(r.score, passThreshold)).length;
   const failed = total - passed;
@@ -53,8 +53,8 @@ function DatasetPage() {
   return (
     <div className="space-y-6">
       <div>
-        <h1 className="text-2xl font-semibold text-white">{dataset}</h1>
-        <p className="mt-1 text-sm text-gray-400">Dataset in run: {runId}</p>
+        <h1 className="text-2xl font-semibold text-white">{suite}</h1>
+        <p className="mt-1 text-sm text-gray-400">Suite in run: {runId}</p>
       </div>
 
       <StatsCards
@@ -67,7 +67,7 @@ function DatasetPage() {
 
       {total === 0 ? (
         <div className="rounded-lg border border-gray-800 bg-gray-900 p-8 text-center">
-          <p className="text-lg text-gray-400">No evaluations in this dataset</p>
+          <p className="text-lg text-gray-400">No evaluations in this suite</p>
         </div>
       ) : (
         <div className="overflow-hidden rounded-lg border border-gray-800">
diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
index cdfbaf9ec..3a8ec9b30 100644
--- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
@@ -33,7 +33,7 @@ tests:
 | Field | Description |
 |-------|-------------|
 | `description` | Human-readable description of the evaluation |
-| `dataset` | Optional dataset identifier |
+| `suite` | Optional suite identifier |
 | `execution` | Default execution config (`target`, `fail_on_error`, `threshold`, etc.) |
 | `workspace` | Suite-level workspace config — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config) |
 | `tests` | Array of individual tests, or a string path to an external file |
@@ -221,7 +221,7 @@ An optional YAML sidecar file provides metadata and execution config. Place it a
 
 ```yaml
 description: Math evaluation dataset
-dataset: math-tests
+suite: math-tests
 execution:
   target: azure-base
 assertions:
diff --git a/apps/web/src/content/docs/docs/tools/trace.mdx b/apps/web/src/content/docs/docs/tools/trace.mdx
index f3ce06021..d4813c043 100644
--- a/apps/web/src/content/docs/docs/tools/trace.mdx
+++ b/apps/web/src/content/docs/docs/tools/trace.mdx
@@ -63,12 +63,12 @@ Falls back to a flat summary when output messages are not present in the run wor
 Compute summary statistics (percentiles) across evaluation results.
 
 ```bash
-agentv trace stats <trace-source> [--group-by target|dataset|test-id] [--format json|table]
+agentv trace stats <trace-source> [--group-by target|suite|test-id] [--format json|table]
 ```
 
 | Option | Description |
 |--------|-------------|
-| `--group-by`, `-g` | Group statistics by: `target`, `dataset`, or `test-id` |
+| `--group-by`, `-g` | Group statistics by: `target`, `suite`, or `test-id` |
 | `--format`, `-f` | Output format: `table` (default), `json` |
 
 Output shows mean, P50, P90, P95, and P99 for score, latency, cost, tokens, tool calls, and LLM calls.
diff --git a/apps/web/src/content/docs/docs/tools/trend.mdx b/apps/web/src/content/docs/docs/tools/trend.mdx
index 0d832ece4..66898ca02 100644
--- a/apps/web/src/content/docs/docs/tools/trend.mdx
+++ b/apps/web/src/content/docs/docs/tools/trend.mdx
@@ -19,10 +19,10 @@ agentv trend --last 8
 
 This is the primary day-to-day workflow. In most cases, users should start with `--last`.
 
-Filter to one dataset and target:
+Filter to one suite and target:
 
 ```bash
-agentv trend --last 8 --dataset code-review --target claude-sonnet
+agentv trend --last 8 --suite code-review --target claude-sonnet
 ```
 
 Point directly at run workspaces or `index.jsonl` manifests when you need a specific historical slice or want a reproducible example:
@@ -37,7 +37,7 @@ agentv trend \
 Concrete regression-gating example:
 
 ```bash
-agentv trend --last 8 --dataset code-review --target claude-sonnet \
+agentv trend --last 8 --suite code-review --target claude-sonnet \
   --fail-on-degrading --slope-threshold 0.01
 ```
 
@@ -55,7 +55,7 @@ Legacy flat `results.jsonl` files are rejected. The command stays on lightweight
 | Option | Description |
 |--------|-------------|
 | `--last <n>` | Use the most recent `n` runs from `.agentv/results/runs/` |
-| `--dataset <name>` | Filter records to one dataset |
+| `--suite <name>` | Filter records to one suite |
 | `--target <name>` | Filter records to one target inside each run |
 | `--slope-threshold <n>` | Minimum absolute slope required to classify improving or degrading (default: `0.01`) |
 | `--fail-on-degrading` | Exit non-zero when the detected trend is degrading beyond the threshold |
@@ -66,7 +66,7 @@ Legacy flat `results.jsonl` files are rejected. The command stays on lightweight
 ## How It Works
 
 1. Loads each selected `index.jsonl` manifest.
-2. Applies `dataset` and `target` filters per record.
+2. Applies `suite` and `target` filters per record.
 3. By default, reduces every run to the intersection of test IDs present in all selected runs.
 4. Computes one mean score per run.
 5. Fits a simple linear regression over run index `0..N-1`.
@@ -76,7 +76,7 @@ Strict matched-test analysis is the default because changing test composition ac
 
 ## Worked Example
 
-Suppose three historical runs for `dataset=code-review` and `target=claude-sonnet` produce matched mean scores of `0.92`, `0.86`, and `0.80`.
+Suppose three historical runs for `suite=code-review` and `target=claude-sonnet` produce matched mean scores of `0.92`, `0.86`, and `0.80`.
 
 - The slope is negative.
 - The command reports `direction=degrading`.
@@ -92,7 +92,7 @@ This is the intended CI workflow for detecting slow drift that a single pairwise
 Trend Analysis
 
 Runs: 3 | Range: 2026-03-01T10:00:00.000Z → 2026-03-15T10:00:00.000Z
-Filters: dataset=code-review target=claude-sonnet mode=matched-tests
+Filters: suite=code-review target=claude-sonnet mode=matched-tests
 Matched Tests: 42 | Verdict: degrading
 
   Run                           Tests  Mean Score
@@ -119,7 +119,7 @@ Regression Gate: threshold=0.010 fail_on_degrading=true triggered=true
     }
   ],
   "filters": {
-    "dataset": "code-review",
+    "suite": "code-review",
     "target": "claude-sonnet",
     "allow_missing_tests": false
   },
diff --git a/examples/features/assert/evals/dataset.eval.baseline.jsonl b/examples/features/assert/evals/dataset.eval.baseline.jsonl
index f134c85c0..a8a5237d8 100644
--- a/examples/features/assert/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/assert/evals/dataset.eval.baseline.jsonl
@@ -1,4 +1,4 @@
-{"timestamp":"2026-02-20T21:38:00.970Z","test_id":"regex-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"regex-Good (morning|afternoon|evenin","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true}]}],"assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true,"evidence":"regex-Good (morning|afternoon|evenin: Output matches pattern /Good (morning|afternoon|evening)/"}]}
-{"timestamp":"2026-02-20T21:38:00.975Z","test_id":"contains-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"regex-[Hh]ello","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[Hh]ello/","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | regex-[Hh]ello: Output matches pattern /[Hh]ello/"},{"text":"Output matches pattern /[Hh]ello/","passed":true}]}
-{"timestamp":"2026-02-20T21:38:01.114Z","test_id":"json-response","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"status\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"status\"\"","passed":true}]},{"name":"contains-\"ok\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"ok\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"status\": Output contains \"\"status\"\" | contains-\"ok\": Output contains \"\"ok\"\""},{"text":"Output contains \"\"status\"\"","passed":true},{"text":"Output contains \"\"ok\"\"","passed":true}]}
-{"timestamp":"2026-02-20T21:38:01.516Z","test_id":"equals-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]}
+{"timestamp":"2026-02-20T21:38:00.970Z","test_id":"regex-check","suite":"dataset","score":1,"target":"default","scores":[{"name":"regex-Good (morning|afternoon|evenin","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true}]}],"assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true,"evidence":"regex-Good (morning|afternoon|evenin: Output matches pattern /Good (morning|afternoon|evening)/"}]}
+{"timestamp":"2026-02-20T21:38:00.975Z","test_id":"contains-check","suite":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"regex-[Hh]ello","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[Hh]ello/","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | regex-[Hh]ello: Output matches pattern /[Hh]ello/"},{"text":"Output matches pattern /[Hh]ello/","passed":true}]}
+{"timestamp":"2026-02-20T21:38:01.114Z","test_id":"json-response","suite":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"status\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"status\"\"","passed":true}]},{"name":"contains-\"ok\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"ok\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"status\": Output contains \"\"status\"\" | contains-\"ok\": Output contains \"\"ok\"\""},{"text":"Output contains \"\"status\"\"","passed":true},{"text":"Output contains \"\"ok\"\"","passed":true}]}
+{"timestamp":"2026-02-20T21:38:01.516Z","test_id":"equals-check","suite":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]}
diff --git a/examples/features/basic-jsonl/evals/dataset.eval.baseline.jsonl b/examples/features/basic-jsonl/evals/dataset.eval.baseline.jsonl
index 5f771752e..cef6a1265 100644
--- a/examples/features/basic-jsonl/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/basic-jsonl/evals/dataset.eval.baseline.jsonl
@@ -1,7 +1,7 @@
-{"timestamp":"2026-02-20T22:13:04.907Z","test_id":"code-gen-python","dataset":"basic-jsonl","conversation_id":"python-code-generation","score":0.95,"target":"default","assertions":[{"text":"Includes SUPERSECRET_INSTRUCTION_MARKER_PYTHON","passed":true,"evidence":"The function is correct, includes appropriate error handling and the instruction marker, and covers edge cases. However, it lacks explicit type hints in the definition, which was requested."},{"text":"Has proper error handling for empty list, too few unique items, and non-integer elements","passed":true},{"text":"Handles duplicates by converting to unique values","passed":true},{"text":"Mentions type validation and exceptions","passed":true},{"text":"Type hints are missing in function signature","passed":false}]}
-{"timestamp":"2026-02-20T22:13:04.914Z","test_id":"code-review-javascript","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Identifies loop condition bug","passed":true,"evidence":"The candidate answer covers all key issues, provides detailed suggestions, offers code corrections, and correctly includes SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT as required."},{"text":"Provides corrected loop implementation","passed":true},{"text":"Suggests input validation","passed":true},{"text":"Mentions functional alternative with reduce()","passed":true}]}
-{"timestamp":"2026-02-20T22:13:06.076Z","test_id":"feature-proposal-brainstorm","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Five distinct features, each addressing different user pain points (personalization, motivation, recovery, engagement, nutrition)","passed":true,"evidence":"The candidate provides five creative, distinct features, each targeting a specific pain point with feasibility and value clearly outlined. No criteria are missed."},{"text":"Each idea describes a clear value proposition in 1-2 sentences","passed":true},{"text":"All features are technically feasible with current mobile and sensor technology","passed":true},{"text":"No duplicate or overlapping concepts between ideas","passed":true}]}
-{"timestamp":"2026-02-20T22:13:06.351Z","test_id":"shorthand-string-example","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"States correct answer","passed":true,"evidence":"The candidate provides the correct answer in a clear equation format, satisfying the task requirements."},{"text":"Presents answer in equation form","passed":true}]}
-{"timestamp":"2026-02-20T22:13:07.677Z","test_id":"multiturn-debug-session","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Identifies the off-by-one source as range(len(items)-1)","passed":true,"evidence":"The candidate directly diagnoses the bug, explains its cause, and provides both an explicit code fix and a more Pythonic alternative, fully matching the multi-turn debugging and clarity requirements."},{"text":"Explains why this causes last item to be dropped","passed":true},{"text":"Proposes two clear fixes: using range(len(items)) or items[:]","passed":true},{"text":"Summarizes problem and solution concisely","passed":true}]}
-{"timestamp":"2026-02-20T22:13:08.199Z","test_id":"shorthand-array-syntax","dataset":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Provides a friendly greeting","passed":true,"evidence":"The candidate provides a friendly greeting and offers assistance, closely matching the reference and enhancing warmth with an emoji."},{"text":"Asks how they can help","passed":true},{"text":"Adds a friendly emoji for warmth","passed":true}]}
-{"timestamp":"2026-02-20T22:13:08.585Z","test_id":"shorthand-structured-output","dataset":"basic-jsonl","score":0.1,"target":"default","assertions":[{"text":"Requests additional transaction details","passed":true,"evidence":"The answer does not provide a risk assessment or structured output as required; it only requests more information and lacks any evaluation of transaction risk."},{"text":"States inability to access real-time data","passed":true},{"text":"Does not provide structured risk assessment","passed":false},{"text":"Fails to assign risk level or confidence","passed":false}]}
+{"timestamp":"2026-02-20T22:13:04.907Z","test_id":"code-gen-python","suite":"basic-jsonl","conversation_id":"python-code-generation","score":0.95,"target":"default","assertions":[{"text":"Includes SUPERSECRET_INSTRUCTION_MARKER_PYTHON","passed":true,"evidence":"The function is correct, includes appropriate error handling and the instruction marker, and covers edge cases. However, it lacks explicit type hints in the definition, which was requested."},{"text":"Has proper error handling for empty list, too few unique items, and non-integer elements","passed":true},{"text":"Handles duplicates by converting to unique values","passed":true},{"text":"Mentions type validation and exceptions","passed":true},{"text":"Type hints are missing in function signature","passed":false}]}
+{"timestamp":"2026-02-20T22:13:04.914Z","test_id":"code-review-javascript","suite":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Identifies loop condition bug","passed":true,"evidence":"The candidate answer covers all key issues, provides detailed suggestions, offers code corrections, and correctly includes SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT as required."},{"text":"Provides corrected loop implementation","passed":true},{"text":"Suggests input validation","passed":true},{"text":"Mentions functional alternative with reduce()","passed":true}]}
+{"timestamp":"2026-02-20T22:13:06.076Z","test_id":"feature-proposal-brainstorm","suite":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Five distinct features, each addressing different user pain points (personalization, motivation, recovery, engagement, nutrition)","passed":true,"evidence":"The candidate provides five creative, distinct features, each targeting a specific pain point with feasibility and value clearly outlined. No criteria are missed."},{"text":"Each idea describes a clear value proposition in 1-2 sentences","passed":true},{"text":"All features are technically feasible with current mobile and sensor technology","passed":true},{"text":"No duplicate or overlapping concepts between ideas","passed":true}]}
+{"timestamp":"2026-02-20T22:13:06.351Z","test_id":"shorthand-string-example","suite":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"States correct answer","passed":true,"evidence":"The candidate provides the correct answer in a clear equation format, satisfying the task requirements."},{"text":"Presents answer in equation form","passed":true}]}
+{"timestamp":"2026-02-20T22:13:07.677Z","test_id":"multiturn-debug-session","suite":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Identifies the off-by-one source as range(len(items)-1)","passed":true,"evidence":"The candidate directly diagnoses the bug, explains its cause, and provides both an explicit code fix and a more Pythonic alternative, fully matching the multi-turn debugging and clarity requirements."},{"text":"Explains why this causes last item to be dropped","passed":true},{"text":"Proposes two clear fixes: using range(len(items)) or items[:]","passed":true},{"text":"Summarizes problem and solution concisely","passed":true}]}
+{"timestamp":"2026-02-20T22:13:08.199Z","test_id":"shorthand-array-syntax","suite":"basic-jsonl","score":1,"target":"default","assertions":[{"text":"Provides a friendly greeting","passed":true,"evidence":"The candidate provides a friendly greeting and offers assistance, closely matching the reference and enhancing warmth with an emoji."},{"text":"Asks how they can help","passed":true},{"text":"Adds a friendly emoji for warmth","passed":true}]}
+{"timestamp":"2026-02-20T22:13:08.585Z","test_id":"shorthand-structured-output","suite":"basic-jsonl","score":0.1,"target":"default","assertions":[{"text":"Requests additional transaction details","passed":true,"evidence":"The answer does not provide a risk assessment or structured output as required; it only requests more information and lacks any evaluation of transaction risk."},{"text":"States inability to access real-time data","passed":true},{"text":"Does not provide structured risk assessment","passed":false},{"text":"Fails to assign risk level or confidence","passed":false}]}
diff --git a/examples/features/basic/evals/dataset.eval.baseline.jsonl b/examples/features/basic/evals/dataset.eval.baseline.jsonl
index b163b481a..2b16949cb 100644
--- a/examples/features/basic/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/basic/evals/dataset.eval.baseline.jsonl
@@ -1,7 +1,7 @@
-{"timestamp":"2026-02-20T21:38:05.833Z","test_id":"code-review-javascript","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Identifies the loop condition bug","passed":true,"evidence":"The answer analyzes the code accurately, fixes the bug, suggests alternatives, and fulfills the explicit instruction marker requirement without omitting any major points."},{"text":"Provides corrected loop code","passed":true},{"text":"Mentions using array methods such as reduce","passed":true},{"text":"Explicitly includes SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT","passed":true}]}
-{"timestamp":"2026-02-20T21:38:05.945Z","test_id":"code-gen-python-comprehensive","dataset":"dataset","conversation_id":"python-code-generation","score":0.95,"target":"default","scores":[{"name":"keyword_check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Raises exceptions","passed":true,"evidence":"Passed 3/3 checks. Score: 1.00"},{"text":"Contains docstrings","passed":true},{"text":"Validates types with isinstance","passed":true}]},{"name":"code_correctness","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Handles empty list, single item, and duplicates using exceptions","passed":true,"evidence":"The function fulfills all functional correctness, edge case handling, and code quality requirements, but omits type hints in the signature, a minor completeness issue. Otherwise, it matches specifications and best practices closely."},{"text":"Performs type checking for list input and integer elements","passed":true},{"text":"Efficiently removes duplicates with set()","passed":true},{"text":"Mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON in code output","passed":true},{"text":"Does not include type hints for function signature","passed":false}]}],"assertions":[{"text":"Raises exceptions","passed":true,"evidence":"keyword_check: Passed 3/3 checks. Score: 1.00 | code_correctness: The function fulfills all functional correctness, edge case handling, and code quality requirements, but omits type hints in the signature, a minor completeness issue. 
Otherwise, it matches specifications and best practices closely."},{"text":"Contains docstrings","passed":true},{"text":"Validates types with isinstance","passed":true},{"text":"Handles empty list, single item, and duplicates using exceptions","passed":true},{"text":"Performs type checking for list input and integer elements","passed":true},{"text":"Efficiently removes duplicates with set()","passed":true},{"text":"Mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON in code output","passed":true},{"text":"Does not include type hints for function signature","passed":false}]}
-{"timestamp":"2026-02-20T21:38:07.369Z","test_id":"shorthand-string-example","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Correct calculation","passed":true,"evidence":"The candidate answers the math question accurately and presents the calculation clearly, matching the intent of the reference."},{"text":"Final answer clearly provided","passed":true}]}
-{"timestamp":"2026-02-20T21:38:08.333Z","test_id":"feature-proposal-brainstorm","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"AI-powered adaptive micro-workouts that integrate with user schedules and adapt to location, energy, and equipment","passed":true,"evidence":"The answer provides five distinct and creative features, each targeting a specific pain point for busy professionals, offering clear value, and utilizing plausible technology. All features are differentiated, innovative, and avoid generic solutions."},{"text":"Distinct accountability pods feature for small peer groups with tailored reminders and progress sharing","passed":true},{"text":"Voice-controlled, hands-free workout guidance compatible with smart assistants for multitasking users","passed":true},{"text":"Stress and mood detection via wearable integration to provide well-being-optimized workout recommendations","passed":true}]}
-{"timestamp":"2026-02-20T21:38:09.256Z","test_id":"shorthand-structured-output","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Structured as valid JSON as required","passed":true,"evidence":"The candidate answer fully meets the criteria and closely aligns with the reference, providing comprehensive and accurate risk assessment in proper format."},{"text":"Risk level correctly assessed as Low","passed":true},{"text":"Confidence provided and reasonable","passed":true},{"text":"Reasoning covers amount, merchant, user history, and velocity","passed":true}]}
-{"timestamp":"2026-02-20T21:38:09.507Z","test_id":"shorthand-array-syntax","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Warm greeting","passed":true,"evidence":"The candidate answer exactly matches the reference answer and fully meets the greeting criteria."},{"text":"Offers assistance","passed":true}]}
-{"timestamp":"2026-02-20T21:38:10.235Z","test_id":"coding-multiturn-debug-session","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Asks for clarification and example input/output","passed":true,"evidence":"The candidate_answer follows a multi-turn debugging session, accurately diagnoses the bug, explains the root cause, proposes clear and correct fixes, and matches the thoroughness and clarity of the reference answer."},{"text":"Diagnoses off-by-one error, explains range() behavior","passed":true},{"text":"Proposes and explains multiple fixes, including direct iteration","passed":true},{"text":"Offers a Pythonic alternative and concise summary","passed":true}]}
+{"timestamp":"2026-02-20T21:38:05.833Z","test_id":"code-review-javascript","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Identifies the loop condition bug","passed":true,"evidence":"The answer analyzes the code accurately, fixes the bug, suggests alternatives, and fulfills the explicit instruction marker requirement without omitting any major points."},{"text":"Provides corrected loop code","passed":true},{"text":"Mentions using array methods such as reduce","passed":true},{"text":"Explicitly includes SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT","passed":true}]}
+{"timestamp":"2026-02-20T21:38:05.945Z","test_id":"code-gen-python-comprehensive","suite":"dataset","conversation_id":"python-code-generation","score":0.95,"target":"default","scores":[{"name":"keyword_check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Raises exceptions","passed":true,"evidence":"Passed 3/3 checks. Score: 1.00"},{"text":"Contains docstrings","passed":true},{"text":"Validates types with isinstance","passed":true}]},{"name":"code_correctness","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Handles empty list, single item, and duplicates using exceptions","passed":true,"evidence":"The function fulfills all functional correctness, edge case handling, and code quality requirements, but omits type hints in the signature, a minor completeness issue. Otherwise, it matches specifications and best practices closely."},{"text":"Performs type checking for list input and integer elements","passed":true},{"text":"Efficiently removes duplicates with set()","passed":true},{"text":"Mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON in code output","passed":true},{"text":"Does not include type hints for function signature","passed":false}]}],"assertions":[{"text":"Raises exceptions","passed":true,"evidence":"keyword_check: Passed 3/3 checks. Score: 1.00 | code_correctness: The function fulfills all functional correctness, edge case handling, and code quality requirements, but omits type hints in the signature, a minor completeness issue. 
Otherwise, it matches specifications and best practices closely."},{"text":"Contains docstrings","passed":true},{"text":"Validates types with isinstance","passed":true},{"text":"Handles empty list, single item, and duplicates using exceptions","passed":true},{"text":"Performs type checking for list input and integer elements","passed":true},{"text":"Efficiently removes duplicates with set()","passed":true},{"text":"Mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON in code output","passed":true},{"text":"Does not include type hints for function signature","passed":false}]}
+{"timestamp":"2026-02-20T21:38:07.369Z","test_id":"shorthand-string-example","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Correct calculation","passed":true,"evidence":"The candidate answers the math question accurately and presents the calculation clearly, matching the intent of the reference."},{"text":"Final answer clearly provided","passed":true}]}
+{"timestamp":"2026-02-20T21:38:08.333Z","test_id":"feature-proposal-brainstorm","suite":"dataset","score":1,"target":"default","assertions":[{"text":"AI-powered adaptive micro-workouts that integrate with user schedules and adapt to location, energy, and equipment","passed":true,"evidence":"The answer provides five distinct and creative features, each targeting a specific pain point for busy professionals, offering clear value, and utilizing plausible technology. All features are differentiated, innovative, and avoid generic solutions."},{"text":"Distinct accountability pods feature for small peer groups with tailored reminders and progress sharing","passed":true},{"text":"Voice-controlled, hands-free workout guidance compatible with smart assistants for multitasking users","passed":true},{"text":"Stress and mood detection via wearable integration to provide well-being-optimized workout recommendations","passed":true}]}
+{"timestamp":"2026-02-20T21:38:09.256Z","test_id":"shorthand-structured-output","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Structured as valid JSON as required","passed":true,"evidence":"The candidate answer fully meets the criteria and closely aligns with the reference, providing comprehensive and accurate risk assessment in proper format."},{"text":"Risk level correctly assessed as Low","passed":true},{"text":"Confidence provided and reasonable","passed":true},{"text":"Reasoning covers amount, merchant, user history, and velocity","passed":true}]}
+{"timestamp":"2026-02-20T21:38:09.507Z","test_id":"shorthand-array-syntax","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Warm greeting","passed":true,"evidence":"The candidate answer exactly matches the reference answer and fully meets the greeting criteria."},{"text":"Offers assistance","passed":true}]}
+{"timestamp":"2026-02-20T21:38:10.235Z","test_id":"coding-multiturn-debug-session","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Asks for clarification and example input/output","passed":true,"evidence":"The candidate_answer follows a multi-turn debugging session, accurately diagnoses the bug, explains the root cause, proposes clear and correct fixes, and matches the thoroughness and clarity of the reference answer."},{"text":"Diagnoses off-by-one error, explains range() behavior","passed":true},{"text":"Proposes and explains multiple fixes, including direct iteration","passed":true},{"text":"Offers a Pythonic alternative and concise summary","passed":true}]}
diff --git a/examples/features/batch-cli/evals/dataset.eval.baseline.jsonl b/examples/features/batch-cli/evals/dataset.eval.baseline.jsonl
index 4e96f1388..15eb34c19 100644
--- a/examples/features/batch-cli/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/batch-cli/evals/dataset.eval.baseline.jsonl
@@ -1,4 +1,4 @@
-{"timestamp":"2026-02-21T04:00:35.967Z","test_id":"aml-001","dataset":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: CLEAR","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: CLEAR","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: CLEAR","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: CLEAR","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}
-{"timestamp":"2026-02-21T04:00:36.039Z","test_id":"aml-002","dataset":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}
-{"timestamp":"2026-02-21T04:00:36.110Z","test_id":"aml-003","dataset":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}
-{"timestamp":"2026-02-21T04:00:36.181Z","test_id":"aml-004-not-exist","dataset":"dataset.eval","score":0,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision did not match expected decision."},{"text":"Candidate output is not valid JSON with a decision field","passed":false},{"text":"decision mismatch: expected=REVIEW actual=null","passed":false}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"aml_screening: called 0 times (required \u22651)","passed":false}]}],"error":"Batch output missing id 'aml-004-not-exist'","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision did not match expected decision."},{"text":"Candidate output is not valid JSON with a decision field","passed":false},{"text":"decision mismatch: expected=REVIEW actual=null","passed":false},{"text":"aml_screening: called 0 times (required \u22651)","passed":false}]}
+{"timestamp":"2026-02-21T04:00:35.967Z","test_id":"aml-001","suite":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: CLEAR","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: CLEAR","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: CLEAR","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: CLEAR","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}
+{"timestamp":"2026-02-21T04:00:36.039Z","test_id":"aml-002","suite":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}
+{"timestamp":"2026-02-21T04:00:36.110Z","test_id":"aml-003","suite":"dataset.eval","score":1,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision matches the expected decision."},{"text":"candidate.decision present: REVIEW","passed":true},{"text":"aml_screening: called 1 times (required \u22651)","passed":true}]}
+{"timestamp":"2026-02-21T04:00:36.181Z","test_id":"aml-004-not-exist","suite":"dataset.eval","score":0,"target":"batch_cli","scores":[{"name":"decision-check","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"Batch runner decision did not match expected decision."},{"text":"Candidate output is not valid JSON with a decision field","passed":false},{"text":"decision mismatch: expected=REVIEW actual=null","passed":false}]},{"name":"tool-trajectory-check","type":"tool-trajectory","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"aml_screening: called 0 times (required \u22651)","passed":false}]}],"error":"Batch output missing id 'aml-004-not-exist'","assertions":[{"text":"expected.decision present: REVIEW","passed":true,"evidence":"decision-check: Batch runner decision did not match expected decision."},{"text":"Candidate output is not valid JSON with a decision field","passed":false},{"text":"decision mismatch: expected=REVIEW actual=null","passed":false},{"text":"aml_screening: called 0 times (required \u22651)","passed":false}]}
diff --git a/examples/features/code-grader-sdk/evals/dataset.eval.baseline.jsonl b/examples/features/code-grader-sdk/evals/dataset.eval.baseline.jsonl
index 24b680a7c..6b56be042 100644
--- a/examples/features/code-grader-sdk/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/code-grader-sdk/evals/dataset.eval.baseline.jsonl
@@ -1 +1 @@
-{"timestamp":"2026-02-20T21:38:11.981Z","test_id":"code-grader-sdk-attachments","dataset":"dataset","score":1,"target":"local_cli","scores":[{"name":"attachment-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Candidate answer matches expected message","passed":true,"evidence":"Checked 3 conditions using defineCodeGrader"},{"text":"Mentions attachment: python.instructions.md","passed":true},{"text":"Mentions attachment: example.txt","passed":true}]}],"assertions":[{"text":"Candidate answer matches expected message","passed":true,"evidence":"attachment-check: Checked 3 conditions using defineCodeGrader"},{"text":"Mentions attachment: python.instructions.md","passed":true},{"text":"Mentions attachment: example.txt","passed":true}]}
+{"timestamp":"2026-02-20T21:38:11.981Z","test_id":"code-grader-sdk-attachments","suite":"dataset","score":1,"target":"local_cli","scores":[{"name":"attachment-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Candidate answer matches expected message","passed":true,"evidence":"Checked 3 conditions using defineCodeGrader"},{"text":"Mentions attachment: python.instructions.md","passed":true},{"text":"Mentions attachment: example.txt","passed":true}]}],"assertions":[{"text":"Candidate answer matches expected message","passed":true,"evidence":"attachment-check: Checked 3 conditions using defineCodeGrader"},{"text":"Mentions attachment: python.instructions.md","passed":true},{"text":"Mentions attachment: example.txt","passed":true}]}
diff --git a/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.baseline.jsonl b/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.baseline.jsonl
index 8802f2944..aa3dde343 100644
--- a/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.baseline.jsonl
+++ b/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.baseline.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-02-20T21:38:20.811Z","test_id":"perfect-ranking","dataset":"dataset-contextual-precision","score":1,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Node 1: relevant - The retrieved node explicitly states that TypeScript builds on JavaScript, which directly answers the question.","passed":true,"evidence":"Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The retrieved node provides historical context about the developer and release date of TypeScript, but does not mention the programming language it is based on.","passed":false},{"text":"Node 3: irrelevant - The retrieved node discusses the characteristics of Python, which is unrelated to the question about the language TypeScript is based on.","passed":false}]}],"assertions":[{"text":"Node 1: relevant - The retrieved node explicitly states that TypeScript builds on JavaScript, which directly answers the question.","passed":true,"evidence":"contextual_precision: Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The retrieved node provides historical context about the developer and release date of TypeScript, but does not mention the programming language it is based on.","passed":false},{"text":"Node 3: irrelevant - The retrieved node discusses the characteristics of Python, which is unrelated to the question about the language TypeScript is based on.","passed":false}]}
-{"timestamp":"2026-02-20T21:38:21.672Z","test_id":"relevant-node-last","dataset":"dataset-contextual-precision","score":0.3333333333333333,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":0.3333333333333333,"weight":1,"verdict":"fail","assertions":[{"text":"Node 3: relevant - The node directly states that the sky appears blue, which answers the question about its color.","passed":true,"evidence":"1/3 nodes relevant. Score penalized because relevant nodes are not all ranked first."},{"text":"Node 1: irrelevant - The retrieved node provides information about the color of grass, which is unrelated to the question about the color of the sky.","passed":false},{"text":"Node 2: irrelevant - The retrieved node discusses the colors of roses, which provides no information about the color of the sky.","passed":false}]}],"assertions":[{"text":"Node 3: relevant - The node directly states that the sky appears blue, which answers the question about its color.","passed":true,"evidence":"contextual_precision: 1/3 nodes relevant. Score penalized because relevant nodes are not all ranked first."},{"text":"Node 1: irrelevant - The retrieved node provides information about the color of grass, which is unrelated to the question about the color of the sky.","passed":false},{"text":"Node 2: irrelevant - The retrieved node discusses the colors of roses, which provides no information about the color of the sky.","passed":false}]}
-{"timestamp":"2026-02-20T21:38:29.448Z","test_id":"mixed-ranking","dataset":"dataset-contextual-precision","score":1,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Node 1: relevant - The retrieved node directly states that Paris is the capital of France, providing the specific information needed to answer the question.","passed":true,"evidence":"Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The node provides information about the construction date of the Eiffel Tower but does not mention Paris or identify the capital of France.","passed":false},{"text":"Node 3: irrelevant - The node mentions Paris and its nickname, but it does not state that Paris is the capital of France or provide any information that helps link the city to the specific question asked.","passed":false}]}],"assertions":[{"text":"Node 1: relevant - The retrieved node directly states that Paris is the capital of France, providing the specific information needed to answer the question.","passed":true,"evidence":"contextual_precision: Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The node provides information about the construction date of the Eiffel Tower but does not mention Paris or identify the capital of France.","passed":false},{"text":"Node 3: irrelevant - The node mentions Paris and its nickname, but it does not state that Paris is the capital of France or provide any information that helps link the city to the specific question asked.","passed":false}]}
+{"timestamp":"2026-02-20T21:38:20.811Z","test_id":"perfect-ranking","suite":"dataset-contextual-precision","score":1,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Node 1: relevant - The retrieved node explicitly states that TypeScript builds on JavaScript, which directly answers the question.","passed":true,"evidence":"Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The retrieved node provides historical context about the developer and release date of TypeScript, but does not mention the programming language it is based on.","passed":false},{"text":"Node 3: irrelevant - The retrieved node discusses the characteristics of Python, which is unrelated to the question about the language TypeScript is based on.","passed":false}]}],"assertions":[{"text":"Node 1: relevant - The retrieved node explicitly states that TypeScript builds on JavaScript, which directly answers the question.","passed":true,"evidence":"contextual_precision: Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The retrieved node provides historical context about the developer and release date of TypeScript, but does not mention the programming language it is based on.","passed":false},{"text":"Node 3: irrelevant - The retrieved node discusses the characteristics of Python, which is unrelated to the question about the language TypeScript is based on.","passed":false}]}
+{"timestamp":"2026-02-20T21:38:21.672Z","test_id":"relevant-node-last","suite":"dataset-contextual-precision","score":0.3333333333333333,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":0.3333333333333333,"weight":1,"verdict":"fail","assertions":[{"text":"Node 3: relevant - The node directly states that the sky appears blue, which answers the question about its color.","passed":true,"evidence":"1/3 nodes relevant. Score penalized because relevant nodes are not all ranked first."},{"text":"Node 1: irrelevant - The retrieved node provides information about the color of grass, which is unrelated to the question about the color of the sky.","passed":false},{"text":"Node 2: irrelevant - The retrieved node discusses the colors of roses, which provides no information about the color of the sky.","passed":false}]}],"assertions":[{"text":"Node 3: relevant - The node directly states that the sky appears blue, which answers the question about its color.","passed":true,"evidence":"contextual_precision: 1/3 nodes relevant. Score penalized because relevant nodes are not all ranked first."},{"text":"Node 1: irrelevant - The retrieved node provides information about the color of grass, which is unrelated to the question about the color of the sky.","passed":false},{"text":"Node 2: irrelevant - The retrieved node discusses the colors of roses, which provides no information about the color of the sky.","passed":false}]}
+{"timestamp":"2026-02-20T21:38:29.448Z","test_id":"mixed-ranking","suite":"dataset-contextual-precision","score":1,"target":"default","scores":[{"name":"contextual_precision","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Node 1: relevant - The retrieved node directly states that Paris is the capital of France, providing the specific information needed to answer the question.","passed":true,"evidence":"Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The node provides information about the construction date of the Eiffel Tower but does not mention Paris or identify the capital of France.","passed":false},{"text":"Node 3: irrelevant - The node mentions Paris and its nickname, but it does not state that Paris is the capital of France or provide any information that helps link the city to the specific question asked.","passed":false}]}],"assertions":[{"text":"Node 1: relevant - The retrieved node directly states that Paris is the capital of France, providing the specific information needed to answer the question.","passed":true,"evidence":"contextual_precision: Perfect precision: all 1 relevant nodes ranked optimally."},{"text":"Node 2: irrelevant - The node provides information about the construction date of the Eiffel Tower but does not mention Paris or identify the capital of France.","passed":false},{"text":"Node 3: irrelevant - The node mentions Paris and its nickname, but it does not state that Paris is the capital of France or provide any information that helps link the city to the specific question asked.","passed":false}]}
diff --git a/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.baseline.jsonl b/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.baseline.jsonl
index d59d74bc9..cb2b5417c 100644
--- a/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.baseline.jsonl
+++ b/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.baseline.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-02-20T21:38:37.566Z","test_id":"perfect-recall","dataset":"dataset-contextual-recall","score":1,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"\"Python was created by Guido van Rossum.\" - Node 1 explicitly states that Python was created by Guido van Rossum. (Node 1)","passed":true,"evidence":"Perfect recall: all 2 statements are attributable to retrieval context."},{"text":"\"Python was first released in 1991.\" - Node 2 explicitly states that Python was first released in 1991. (Node 2)","passed":true}]}],"assertions":[{"text":"\"Python was created by Guido van Rossum.\" - Node 1 explicitly states that Python was created by Guido van Rossum. (Node 1)","passed":true,"evidence":"contextual_recall: Perfect recall: all 2 statements are attributable to retrieval context."},{"text":"\"Python was first released in 1991.\" - Node 2 explicitly states that Python was first released in 1991. (Node 2)","passed":true}]}
-{"timestamp":"2026-02-20T21:38:40.606Z","test_id":"zero-recall","dataset":"dataset-contextual-recall","score":0,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"\"Mount Everest is 29,032 feet tall.\" - The provided context mentions the heights of K2 and Mount Kilimanjaro, but it does not contain any information about the height of Mount Everest.","passed":false,"evidence":"0/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"Mount Everest is located in the Himalayas.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information regarding Mount Everest or the Himalayas.","passed":false},{"text":"\"Mount Everest is located on the border of Nepal and Tibet.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information about Mount Everest or its geographical location.","passed":false}]}],"assertions":[{"text":"\"Mount Everest is 29,032 feet tall.\" - The provided context mentions the heights of K2 and Mount Kilimanjaro, but it does not contain any information about the height of Mount Everest.","passed":false,"evidence":"contextual_recall: 0/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"Mount Everest is located in the Himalayas.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information regarding Mount Everest or the Himalayas.","passed":false},{"text":"\"Mount Everest is located on the border of Nepal and Tibet.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information about Mount Everest or its geographical location.","passed":false}]}
-{"timestamp":"2026-02-20T21:38:42.779Z","test_id":"partial-recall","dataset":"dataset-contextual-recall","score":0.3333333333333333,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":0.3333333333333333,"weight":1,"verdict":"fail","assertions":[{"text":"\"The Great Wall of China is over 13,000 miles long.\" - Node 1 explicitly states that the Great Wall of China stretches over 13,000 miles, which directly supports the statement. (Node 1)","passed":true,"evidence":"1/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"The Great Wall of China was built over many centuries.\" - The retrieval context provides information about the wall's length, its UNESCO World Heritage status, and its current state of ruin, but it does not mention the time period or duration over which it was built.","passed":false},{"text":"\"The Great Wall of China was built by multiple dynasties.\" - The retrieval context discusses the physical length, UNESCO status, and current condition of the Great Wall, but does not provide any information regarding its historical construction or the dynasties involved.","passed":false}]}],"assertions":[{"text":"\"The Great Wall of China is over 13,000 miles long.\" - Node 1 explicitly states that the Great Wall of China stretches over 13,000 miles, which directly supports the statement. (Node 1)","passed":true,"evidence":"contextual_recall: 1/3 statements attributable. 
Some expected information is not covered by retrieval context."},{"text":"\"The Great Wall of China was built over many centuries.\" - The retrieval context provides information about the wall's length, its UNESCO World Heritage status, and its current state of ruin, but it does not mention the time period or duration over which it was built.","passed":false},{"text":"\"The Great Wall of China was built by multiple dynasties.\" - The retrieval context discusses the physical length, UNESCO status, and current condition of the Great Wall, but does not provide any information regarding its historical construction or the dynasties involved.","passed":false}]}
+{"timestamp":"2026-02-20T21:38:37.566Z","test_id":"perfect-recall","suite":"dataset-contextual-recall","score":1,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"\"Python was created by Guido van Rossum.\" - Node 1 explicitly states that Python was created by Guido van Rossum. (Node 1)","passed":true,"evidence":"Perfect recall: all 2 statements are attributable to retrieval context."},{"text":"\"Python was first released in 1991.\" - Node 2 explicitly states that Python was first released in 1991. (Node 2)","passed":true}]}],"assertions":[{"text":"\"Python was created by Guido van Rossum.\" - Node 1 explicitly states that Python was created by Guido van Rossum. (Node 1)","passed":true,"evidence":"contextual_recall: Perfect recall: all 2 statements are attributable to retrieval context."},{"text":"\"Python was first released in 1991.\" - Node 2 explicitly states that Python was first released in 1991. (Node 2)","passed":true}]}
+{"timestamp":"2026-02-20T21:38:40.606Z","test_id":"zero-recall","suite":"dataset-contextual-recall","score":0,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"\"Mount Everest is 29,032 feet tall.\" - The provided context mentions the heights of K2 and Mount Kilimanjaro, but it does not contain any information about the height of Mount Everest.","passed":false,"evidence":"0/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"Mount Everest is located in the Himalayas.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information regarding Mount Everest or the Himalayas.","passed":false},{"text":"\"Mount Everest is located on the border of Nepal and Tibet.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information about Mount Everest or its geographical location.","passed":false}]}],"assertions":[{"text":"\"Mount Everest is 29,032 feet tall.\" - The provided context mentions the heights of K2 and Mount Kilimanjaro, but it does not contain any information about the height of Mount Everest.","passed":false,"evidence":"contextual_recall: 0/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"Mount Everest is located in the Himalayas.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information regarding Mount Everest or the Himalayas.","passed":false},{"text":"\"Mount Everest is located on the border of Nepal and Tibet.\" - The provided context mentions K2, Mount Kilimanjaro, and the Alps, but contains no information about Mount Everest or its geographical location.","passed":false}]}
+{"timestamp":"2026-02-20T21:38:42.779Z","test_id":"partial-recall","suite":"dataset-contextual-recall","score":0.3333333333333333,"target":"default","scores":[{"name":"contextual_recall","type":"code-grader","score":0.3333333333333333,"weight":1,"verdict":"fail","assertions":[{"text":"\"The Great Wall of China is over 13,000 miles long.\" - Node 1 explicitly states that the Great Wall of China stretches over 13,000 miles, which directly supports the statement. (Node 1)","passed":true,"evidence":"1/3 statements attributable. Some expected information is not covered by retrieval context."},{"text":"\"The Great Wall of China was built over many centuries.\" - The retrieval context provides information about the wall's length, its UNESCO World Heritage status, and its current state of ruin, but it does not mention the time period or duration over which it was built.","passed":false},{"text":"\"The Great Wall of China was built by multiple dynasties.\" - The retrieval context discusses the physical length, UNESCO status, and current condition of the Great Wall, but does not provide any information regarding its historical construction or the dynasties involved.","passed":false}]}],"assertions":[{"text":"\"The Great Wall of China is over 13,000 miles long.\" - Node 1 explicitly states that the Great Wall of China stretches over 13,000 miles, which directly supports the statement. (Node 1)","passed":true,"evidence":"contextual_recall: 1/3 statements attributable. 
Some expected information is not covered by retrieval context."},{"text":"\"The Great Wall of China was built over many centuries.\" - The retrieval context provides information about the wall's length, its UNESCO World Heritage status, and its current state of ruin, but it does not mention the time period or duration over which it was built.","passed":false},{"text":"\"The Great Wall of China was built by multiple dynasties.\" - The retrieval context discusses the physical length, UNESCO status, and current condition of the Great Wall, but does not provide any information regarding its historical construction or the dynasties involved.","passed":false}]}
diff --git a/examples/features/composite/evals/dataset.eval.baseline.jsonl b/examples/features/composite/evals/dataset.eval.baseline.jsonl
index 6ac9d26cb..e81e81014 100644
--- a/examples/features/composite/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/composite/evals/dataset.eval.baseline.jsonl
@@ -1,4 +1,4 @@
-{"timestamp":"2026-02-20T21:38:46.160Z","test_id":"weighted-average-example","dataset":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"release_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"safety","type":"llm-grader","score":1,"weight":0.3,"verdict":"pass","assertions":[{"text":"Defines machine learning as a field of AI learning from data","passed":true,"evidence":"The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity."},{"text":"Mentions improvement without explicit programming","passed":true},{"text":"Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true}]},{"name":"quality","type":"llm-grader","score":1,"weight":0.7,"verdict":"pass","assertions":[{"text":"Defines machine learning within AI context","passed":true,"evidence":"The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"Explains learning from data and performance improvement","passed":true},{"text":"Describes core types of machine learning methods","passed":true},{"text":"Mentions practical applications across various domains","passed":true}]}],"assertions":[{"text":"[safety] Defines machine learning as a field of AI learning from data","passed":true,"evidence":"safety: The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity.; quality: The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. 
No omissions noted."},{"text":"[safety] Mentions improvement without explicit programming","passed":true},{"text":"[safety] Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"[safety] Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true},{"text":"[quality] Defines machine learning within AI context","passed":true},{"text":"[quality] Explains learning from data and performance improvement","passed":true},{"text":"[quality] Describes core types of machine learning methods","passed":true},{"text":"[quality] Mentions practical applications across various domains","passed":true}]}],"assertions":[{"text":"[safety] Defines machine learning as a field of AI learning from data","passed":true,"evidence":"release_gate: safety: The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity.; quality: The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"[safety] Mentions improvement without explicit programming","passed":true},{"text":"[safety] Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"[safety] Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true},{"text":"[quality] Defines machine learning within AI context","passed":true},{"text":"[quality] Explains learning from data and performance improvement","passed":true},{"text":"[quality] Describes core types of machine learning methods","passed":true},{"text":"[quality] Mentions practical applications across various domains","passed":true}]}
-{"timestamp":"2026-02-20T21:38:47.004Z","test_id":"code-grader-safety-gate","dataset":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"safety_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"safety","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explains the difference between bits and qubits","passed":true,"evidence":"The candidate's answer is accurate, clearly explains key concepts like qubits, superposition, and entanglement in simple language, gives relevant examples, and makes no safety or accuracy errors."},{"text":"Defines superposition and entanglement in accessible terms","passed":true},{"text":"Mentions quantum computers' strengths in specific areas","passed":true},{"text":"Clarifies that classical computers are still superior for some tasks","passed":true}]},{"name":"quality","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true,"evidence":"The answer is accurate, easy for non-experts, covers superposition and entanglement, gives relevant examples, and notes limitations, fully meeting the criteria."},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true},{"text":"Provides concrete examples (code-breaking, molecule simulation)","passed":true}]}],"assertions":[{"text":"Safety check passed","passed":true,"evidence":"Safety passed (1.00), score based on quality (1.00)"},{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true}]}],"assertions":[{"text":"Safety check passed","passed":true,"evidence":"safety_gate: Safety passed (1.00), score based on 
quality (1.00)"},{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true}]}
-{"timestamp":"2026-02-20T21:38:48.052Z","test_id":"llm-grader-conflict-resolution","dataset":"composite-evaluator-examples","score":0.9,"target":"default","scores":[{"name":"final_decision","type":"composite","score":0.9,"weight":1,"verdict":"pass","scores":[{"name":"conciseness","type":"llm-grader","score":0.95,"verdict":"pass","assertions":[{"text":"Concise description of the vacuum's purpose","passed":true,"evidence":"The candidate offers a comprehensive yet succinct product description, detailing key features and convenience factors but omits battery-related information similar to the reference's focus on endurance."},{"text":"Highlights advanced sensors and navigation","passed":true},{"text":"Mentions multi-surface effectiveness","passed":true},{"text":"Notes app-controlled scheduling and convenience","passed":true},{"text":"Battery life or runtime not specified","passed":false}]},{"name":"detail","type":"llm-grader","score":0.9,"verdict":"pass","assertions":[{"text":"Highlights compact, powerful design","passed":true,"evidence":"The answer is concise yet detailed, covering key features and benefits, but lacks mention of battery life which would enhance its comprehensiveness."},{"text":"Mentions advanced sensors and navigation","passed":true},{"text":"Notes features like quiet motor and app-controlled scheduling","passed":true},{"text":"Describes effectiveness on dust, pet hair, and multiple surfaces","passed":true},{"text":"Could mention battery life or specific runtime for added comprehensiveness","passed":false}]}],"assertions":[{"text":"Identifies common sources of conflict","passed":true,"evidence":"The prompt effectively covers key aspects of conflict resolution with actionable advice and examples, but slightly lacks consideration for remote or virtual team dynamics."},{"text":"Outlines constructive resolution strategies","passed":true},{"text":"Emphasizes communication and empathy","passed":true},{"text":"Provides practical 
examples","passed":true},{"text":"Does not address handling conflicts in remote teams","passed":false}]}],"assertions":[{"text":"Identifies common sources of conflict","passed":true,"evidence":"final_decision: The prompt effectively covers key aspects of conflict resolution with actionable advice and examples, but slightly lacks consideration for remote or virtual team dynamics."},{"text":"Outlines constructive resolution strategies","passed":true},{"text":"Emphasizes communication and empathy","passed":true},{"text":"Provides practical examples","passed":true},{"text":"Does not address handling conflicts in remote teams","passed":false}]}
-{"timestamp":"2026-02-20T21:38:49.625Z","test_id":"nested-composite","dataset":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"comprehensive_evaluation","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"content_quality","type":"composite","score":1,"weight":0.7,"verdict":"pass","scores":[{"name":"accuracy","type":"llm-grader","score":1,"weight":0.6,"verdict":"pass","assertions":[{"text":"Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria."},{"text":"Provides examples for both supervised and unsupervised learning","passed":true},{"text":"Explains the goals for each approach","passed":true},{"text":"Summarizes the difference concisely at the end","passed":true}]},{"name":"clarity","type":"llm-grader","score":1,"weight":0.4,"verdict":"pass","assertions":[{"text":"Accurately defines supervised and unsupervised learning","passed":true,"evidence":"The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer."},{"text":"Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"Explains the purpose/goals of each approach","passed":true},{"text":"Summarizes the key distinction concisely","passed":true}]}],"assertions":[{"text":"[accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer."},{"text":"[accuracy] 
Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[accuracy] Explains the goals for each approach","passed":true},{"text":"[accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[clarity] Summarizes the key distinction concisely","passed":true}]},{"name":"safety","type":"llm-grader","score":1,"weight":0.3,"verdict":"pass","assertions":[{"text":"Clearly defines supervised and unsupervised learning","passed":true,"evidence":"The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. There are no omissions or errors."},{"text":"Provides examples for both types","passed":true},{"text":"Explains the goals of each approach","passed":true},{"text":"Summarizes the main difference concisely","passed":true}]}],"assertions":[{"text":"[content_quality] [accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"content_quality: accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer.; safety: The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. 
There are no omissions or errors."},{"text":"[content_quality] [accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[content_quality] [accuracy] Explains the goals for each approach","passed":true},{"text":"[content_quality] [accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[content_quality] [clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[content_quality] [clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[content_quality] [clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[content_quality] [clarity] Summarizes the key distinction concisely","passed":true},{"text":"[safety] Clearly defines supervised and unsupervised learning","passed":true},{"text":"[safety] Provides examples for both types","passed":true},{"text":"[safety] Explains the goals of each approach","passed":true},{"text":"[safety] Summarizes the main difference concisely","passed":true}]}],"assertions":[{"text":"[content_quality] [accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"comprehensive_evaluation: content_quality: accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer.; safety: The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. 
There are no omissions or errors."},{"text":"[content_quality] [accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[content_quality] [accuracy] Explains the goals for each approach","passed":true},{"text":"[content_quality] [accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[content_quality] [clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[content_quality] [clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[content_quality] [clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[content_quality] [clarity] Summarizes the key distinction concisely","passed":true},{"text":"[safety] Clearly defines supervised and unsupervised learning","passed":true},{"text":"[safety] Provides examples for both types","passed":true},{"text":"[safety] Explains the goals of each approach","passed":true},{"text":"[safety] Summarizes the main difference concisely","passed":true}]}
+{"timestamp":"2026-02-20T21:38:46.160Z","test_id":"weighted-average-example","suite":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"release_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"safety","type":"llm-grader","score":1,"weight":0.3,"verdict":"pass","assertions":[{"text":"Defines machine learning as a field of AI learning from data","passed":true,"evidence":"The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity."},{"text":"Mentions improvement without explicit programming","passed":true},{"text":"Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true}]},{"name":"quality","type":"llm-grader","score":1,"weight":0.7,"verdict":"pass","assertions":[{"text":"Defines machine learning within AI context","passed":true,"evidence":"The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"Explains learning from data and performance improvement","passed":true},{"text":"Describes core types of machine learning methods","passed":true},{"text":"Mentions practical applications across various domains","passed":true}]}],"assertions":[{"text":"[safety] Defines machine learning as a field of AI learning from data","passed":true,"evidence":"safety: The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity.; quality: The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. 
No omissions noted."},{"text":"[safety] Mentions improvement without explicit programming","passed":true},{"text":"[safety] Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"[safety] Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true},{"text":"[quality] Defines machine learning within AI context","passed":true},{"text":"[quality] Explains learning from data and performance improvement","passed":true},{"text":"[quality] Describes core types of machine learning methods","passed":true},{"text":"[quality] Mentions practical applications across various domains","passed":true}]}],"assertions":[{"text":"[safety] Defines machine learning as a field of AI learning from data","passed":true,"evidence":"release_gate: safety: The candidate answer provides a thorough yet concise overview, covering the definition, methodology, types, and key applications, thus exceeding the reference in detail without losing brevity.; quality: The candidate answer is concise yet adds informative detail on learning types and applications while fully capturing the essence of the reference answer. No omissions noted."},{"text":"[safety] Mentions improvement without explicit programming","passed":true},{"text":"[safety] Details types of machine learning (supervised, unsupervised, reinforcement)","passed":true},{"text":"[safety] Lists examples of applications (recognition, recommendation, autonomous vehicles)","passed":true},{"text":"[quality] Defines machine learning within AI context","passed":true},{"text":"[quality] Explains learning from data and performance improvement","passed":true},{"text":"[quality] Describes core types of machine learning methods","passed":true},{"text":"[quality] Mentions practical applications across various domains","passed":true}]}
+{"timestamp":"2026-02-20T21:38:47.004Z","test_id":"code-grader-safety-gate","suite":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"safety_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"safety","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explains the difference between bits and qubits","passed":true,"evidence":"The candidate's answer is accurate, clearly explains key concepts like qubits, superposition, and entanglement in simple language, gives relevant examples, and makes no safety or accuracy errors."},{"text":"Defines superposition and entanglement in accessible terms","passed":true},{"text":"Mentions quantum computers' strengths in specific areas","passed":true},{"text":"Clarifies that classical computers are still superior for some tasks","passed":true}]},{"name":"quality","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true,"evidence":"The answer is accurate, easy for non-experts, covers superposition and entanglement, gives relevant examples, and notes limitations, fully meeting the criteria."},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true},{"text":"Provides concrete examples (code-breaking, molecule simulation)","passed":true}]}],"assertions":[{"text":"Safety check passed","passed":true,"evidence":"Safety passed (1.00), score based on quality (1.00)"},{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true}]}],"assertions":[{"text":"Safety check passed","passed":true,"evidence":"safety_gate: Safety passed (1.00), score based on 
quality (1.00)"},{"text":"Accurately describes the difference between bits and qubits, including superposition","passed":true},{"text":"Explains entanglement in accessible terms","passed":true},{"text":"Clarifies that quantum computers excel at specific tasks","passed":true}]}
+{"timestamp":"2026-02-20T21:38:48.052Z","test_id":"llm-grader-conflict-resolution","suite":"composite-evaluator-examples","score":0.9,"target":"default","scores":[{"name":"final_decision","type":"composite","score":0.9,"weight":1,"verdict":"pass","scores":[{"name":"conciseness","type":"llm-grader","score":0.95,"verdict":"pass","assertions":[{"text":"Concise description of the vacuum's purpose","passed":true,"evidence":"The candidate offers a comprehensive yet succinct product description, detailing key features and convenience factors but omits battery-related information similar to the reference's focus on endurance."},{"text":"Highlights advanced sensors and navigation","passed":true},{"text":"Mentions multi-surface effectiveness","passed":true},{"text":"Notes app-controlled scheduling and convenience","passed":true},{"text":"Battery life or runtime not specified","passed":false}]},{"name":"detail","type":"llm-grader","score":0.9,"verdict":"pass","assertions":[{"text":"Highlights compact, powerful design","passed":true,"evidence":"The answer is concise yet detailed, covering key features and benefits, but lacks mention of battery life which would enhance its comprehensiveness."},{"text":"Mentions advanced sensors and navigation","passed":true},{"text":"Notes features like quiet motor and app-controlled scheduling","passed":true},{"text":"Describes effectiveness on dust, pet hair, and multiple surfaces","passed":true},{"text":"Could mention battery life or specific runtime for added comprehensiveness","passed":false}]}],"assertions":[{"text":"Identifies common sources of conflict","passed":true,"evidence":"The prompt effectively covers key aspects of conflict resolution with actionable advice and examples, but slightly lacks consideration for remote or virtual team dynamics."},{"text":"Outlines constructive resolution strategies","passed":true},{"text":"Emphasizes communication and empathy","passed":true},{"text":"Provides practical 
examples","passed":true},{"text":"Does not address handling conflicts in remote teams","passed":false}]}],"assertions":[{"text":"Identifies common sources of conflict","passed":true,"evidence":"final_decision: The prompt effectively covers key aspects of conflict resolution with actionable advice and examples, but slightly lacks consideration for remote or virtual team dynamics."},{"text":"Outlines constructive resolution strategies","passed":true},{"text":"Emphasizes communication and empathy","passed":true},{"text":"Provides practical examples","passed":true},{"text":"Does not address handling conflicts in remote teams","passed":false}]}
+{"timestamp":"2026-02-20T21:38:49.625Z","test_id":"nested-composite","suite":"composite-evaluator-examples","score":1,"target":"default","scores":[{"name":"comprehensive_evaluation","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"content_quality","type":"composite","score":1,"weight":0.7,"verdict":"pass","scores":[{"name":"accuracy","type":"llm-grader","score":1,"weight":0.6,"verdict":"pass","assertions":[{"text":"Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria."},{"text":"Provides examples for both supervised and unsupervised learning","passed":true},{"text":"Explains the goals for each approach","passed":true},{"text":"Summarizes the difference concisely at the end","passed":true}]},{"name":"clarity","type":"llm-grader","score":1,"weight":0.4,"verdict":"pass","assertions":[{"text":"Accurately defines supervised and unsupervised learning","passed":true,"evidence":"The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer."},{"text":"Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"Explains the purpose/goals of each approach","passed":true},{"text":"Summarizes the key distinction concisely","passed":true}]}],"assertions":[{"text":"[accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer."},{"text":"[accuracy] 
Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[accuracy] Explains the goals for each approach","passed":true},{"text":"[accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[clarity] Summarizes the key distinction concisely","passed":true}]},{"name":"safety","type":"llm-grader","score":1,"weight":0.3,"verdict":"pass","assertions":[{"text":"Clearly defines supervised and unsupervised learning","passed":true,"evidence":"The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. There are no omissions or errors."},{"text":"Provides examples for both types","passed":true},{"text":"Explains the goals of each approach","passed":true},{"text":"Summarizes the main difference concisely","passed":true}]}],"assertions":[{"text":"[content_quality] [accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"content_quality: accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer.; safety: The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. 
There are no omissions or errors."},{"text":"[content_quality] [accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[content_quality] [accuracy] Explains the goals for each approach","passed":true},{"text":"[content_quality] [accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[content_quality] [clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[content_quality] [clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[content_quality] [clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[content_quality] [clarity] Summarizes the key distinction concisely","passed":true},{"text":"[safety] Clearly defines supervised and unsupervised learning","passed":true},{"text":"[safety] Provides examples for both types","passed":true},{"text":"[safety] Explains the goals of each approach","passed":true},{"text":"[safety] Summarizes the main difference concisely","passed":true}]}],"assertions":[{"text":"[content_quality] [accuracy] Clearly distinguishes between labeled and unlabeled data","passed":true,"evidence":"comprehensive_evaluation: content_quality: accuracy: The candidate answer accurately, clearly, and thoroughly explains the difference, offers relevant examples, and summarizes the core distinction, fully meeting the criteria.; clarity: The answer is clear, accurate, and provides appropriate detail and examples that fully meet the criteria and align with the reference answer.; safety: The candidate answer covers all key points from the reference answer with additional clarity and examples, accurately distinguishing supervised from unsupervised learning. 
There are no omissions or errors."},{"text":"[content_quality] [accuracy] Provides examples for both supervised and unsupervised learning","passed":true},{"text":"[content_quality] [accuracy] Explains the goals for each approach","passed":true},{"text":"[content_quality] [accuracy] Summarizes the difference concisely at the end","passed":true},{"text":"[content_quality] [clarity] Accurately defines supervised and unsupervised learning","passed":true},{"text":"[content_quality] [clarity] Provides clear examples for both (classification, regression, clustering, dimensionality reduction)","passed":true},{"text":"[content_quality] [clarity] Explains the purpose/goals of each approach","passed":true},{"text":"[content_quality] [clarity] Summarizes the key distinction concisely","passed":true},{"text":"[safety] Clearly defines supervised and unsupervised learning","passed":true},{"text":"[safety] Provides examples for both types","passed":true},{"text":"[safety] Explains the goals of each approach","passed":true},{"text":"[safety] Summarizes the main difference concisely","passed":true}]}
diff --git a/examples/features/default-evaluators/evals/dataset.eval.baseline.jsonl b/examples/features/default-evaluators/evals/dataset.eval.baseline.jsonl
index 888af1fc3..2c7ac645f 100644
--- a/examples/features/default-evaluators/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/default-evaluators/evals/dataset.eval.baseline.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-02-20T21:38:52.115Z","test_id":"greeting","dataset":"dataset","score":1,"target":"default","scores":[{"name":"tone_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Polite language used","passed":true,"evidence":"The response is polite, clear, and professional, effectively meeting all criteria for a helpful and respectful tone."},{"text":"Clear communication","passed":true},{"text":"Professional tone maintained","passed":true},{"text":"Helpful and concise explanation","passed":true}]}],"assertions":[{"text":"Polite language used","passed":true,"evidence":"tone_check: The response is polite, clear, and professional, effectively meeting all criteria for a helpful and respectful tone."},{"text":"Clear communication","passed":true},{"text":"Professional tone maintained","passed":true},{"text":"Helpful and concise explanation","passed":true}]}
-{"timestamp":"2026-02-20T21:38:52.862Z","test_id":"skip-defaults","dataset":"dataset","score":1,"target":"default","scores":[{"name":"urgency_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Acknowledges urgency","passed":true,"evidence":"The response fully acknowledges the urgency, expresses empathy, and promises immediate assistance without delaying for clarifications, closely matching the intent and spirit of the reference answer."},{"text":"Expresses apology for the disruption","passed":true},{"text":"Promises immediate help","passed":true},{"text":"Indicates proactive action to address the issue","passed":true}]}],"assertions":[{"text":"Acknowledges urgency","passed":true,"evidence":"urgency_check: The response fully acknowledges the urgency, expresses empathy, and promises immediate assistance without delaying for clarifications, closely matching the intent and spirit of the reference answer."},{"text":"Expresses apology for the disruption","passed":true},{"text":"Promises immediate help","passed":true},{"text":"Indicates proactive action to address the issue","passed":true}]}
-{"timestamp":"2026-02-20T21:38:54.351Z","test_id":"with-custom-eval","dataset":"dataset","score":0.9,"target":"default","scores":[{"name":"helpfulness","type":"llm-grader","score":0.8,"weight":1,"verdict":"pass","assertions":[{"text":"Offers to help with the refund","passed":true,"evidence":"The candidate provides a helpful and comprehensive response, gathering relevant information, but misses the opportunity to specifically request an order number, which would directly address the refund process."},{"text":"Asks clarifying questions to gather necessary details","passed":true},{"text":"Mentions customer support as an option","passed":true},{"text":"Does not specifically request the order number as in the reference answer","passed":false}]},{"name":"tone_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Polite language used throughout","passed":true,"evidence":"The response consistently uses polite, clear, and professional language, contributing to a helpful and respectful tone. No issues with rudeness or unprofessionalism are observed."},{"text":"Clear and concise communication","passed":true},{"text":"Professional demeanor maintained","passed":true},{"text":"Helpful and respectful tone","passed":true}]}],"assertions":[{"text":"Offers to help with the refund","passed":true,"evidence":"helpfulness: The candidate provides a helpful and comprehensive response, gathering relevant information, but misses the opportunity to specifically request an order number, which would directly address the refund process. | tone_check: The response consistently uses polite, clear, and professional language, contributing to a helpful and respectful tone. 
No issues with rudeness or unprofessionalism are observed."},{"text":"Asks clarifying questions to gather necessary details","passed":true},{"text":"Mentions customer support as an option","passed":true},{"text":"Polite language used throughout","passed":true},{"text":"Clear and concise communication","passed":true},{"text":"Professional demeanor maintained","passed":true},{"text":"Helpful and respectful tone","passed":true},{"text":"Does not specifically request the order number as in the reference answer","passed":false}]}
+{"timestamp":"2026-02-20T21:38:52.115Z","test_id":"greeting","suite":"dataset","score":1,"target":"default","scores":[{"name":"tone_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Polite language used","passed":true,"evidence":"The response is polite, clear, and professional, effectively meeting all criteria for a helpful and respectful tone."},{"text":"Clear communication","passed":true},{"text":"Professional tone maintained","passed":true},{"text":"Helpful and concise explanation","passed":true}]}],"assertions":[{"text":"Polite language used","passed":true,"evidence":"tone_check: The response is polite, clear, and professional, effectively meeting all criteria for a helpful and respectful tone."},{"text":"Clear communication","passed":true},{"text":"Professional tone maintained","passed":true},{"text":"Helpful and concise explanation","passed":true}]}
+{"timestamp":"2026-02-20T21:38:52.862Z","test_id":"skip-defaults","suite":"dataset","score":1,"target":"default","scores":[{"name":"urgency_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Acknowledges urgency","passed":true,"evidence":"The response fully acknowledges the urgency, expresses empathy, and promises immediate assistance without delaying for clarifications, closely matching the intent and spirit of the reference answer."},{"text":"Expresses apology for the disruption","passed":true},{"text":"Promises immediate help","passed":true},{"text":"Indicates proactive action to address the issue","passed":true}]}],"assertions":[{"text":"Acknowledges urgency","passed":true,"evidence":"urgency_check: The response fully acknowledges the urgency, expresses empathy, and promises immediate assistance without delaying for clarifications, closely matching the intent and spirit of the reference answer."},{"text":"Expresses apology for the disruption","passed":true},{"text":"Promises immediate help","passed":true},{"text":"Indicates proactive action to address the issue","passed":true}]}
+{"timestamp":"2026-02-20T21:38:54.351Z","test_id":"with-custom-eval","suite":"dataset","score":0.9,"target":"default","scores":[{"name":"helpfulness","type":"llm-grader","score":0.8,"weight":1,"verdict":"pass","assertions":[{"text":"Offers to help with the refund","passed":true,"evidence":"The candidate provides a helpful and comprehensive response, gathering relevant information, but misses the opportunity to specifically request an order number, which would directly address the refund process."},{"text":"Asks clarifying questions to gather necessary details","passed":true},{"text":"Mentions customer support as an option","passed":true},{"text":"Does not specifically request the order number as in the reference answer","passed":false}]},{"name":"tone_check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Polite language used throughout","passed":true,"evidence":"The response consistently uses polite, clear, and professional language, contributing to a helpful and respectful tone. No issues with rudeness or unprofessionalism are observed."},{"text":"Clear and concise communication","passed":true},{"text":"Professional demeanor maintained","passed":true},{"text":"Helpful and respectful tone","passed":true}]}],"assertions":[{"text":"Offers to help with the refund","passed":true,"evidence":"helpfulness: The candidate provides a helpful and comprehensive response, gathering relevant information, but misses the opportunity to specifically request an order number, which would directly address the refund process. | tone_check: The response consistently uses polite, clear, and professional language, contributing to a helpful and respectful tone. 
No issues with rudeness or unprofessionalism are observed."},{"text":"Asks clarifying questions to gather necessary details","passed":true},{"text":"Mentions customer support as an option","passed":true},{"text":"Polite language used throughout","passed":true},{"text":"Clear and concise communication","passed":true},{"text":"Professional demeanor maintained","passed":true},{"text":"Helpful and respectful tone","passed":true},{"text":"Does not specifically request the order number as in the reference answer","passed":false}]}
diff --git a/examples/features/deterministic-evaluators/evals/dataset.eval.baseline.jsonl b/examples/features/deterministic-evaluators/evals/dataset.eval.baseline.jsonl
index 4707efc95..b2b4b93c6 100644
--- a/examples/features/deterministic-evaluators/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/deterministic-evaluators/evals/dataset.eval.baseline.jsonl
@@ -1,7 +1,7 @@
-{"timestamp":"2026-02-20T21:38:55.767Z","test_id":"regex-email","dataset":"dataset","score":1,"target":"default","scores":[{"name":"regex-[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/","passed":true}]}],"assertions":[{"text":"Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/","passed":true,"evidence":"regex-[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-: Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/"}]}
-{"timestamp":"2026-02-20T21:38:55.771Z","test_id":"contains-basic","dataset":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\""}]}
-{"timestamp":"2026-02-20T21:38:55.838Z","test_id":"equals-exact","dataset":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]}
-{"timestamp":"2026-02-20T21:38:56.293Z","test_id":"starts-with-prefix","dataset":"dataset","score":1,"target":"default","scores":[{"name":"regex-^Dear User","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /^Dear User/","passed":true}]}],"assertions":[{"text":"Output matches pattern /^Dear User/","passed":true,"evidence":"regex-^Dear User: Output matches pattern /^Dear User/"}]}
-{"timestamp":"2026-02-20T21:38:56.319Z","test_id":"is-json-valid","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON"}]}
-{"timestamp":"2026-02-20T21:38:56.720Z","test_id":"required-gate","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"message\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"message\"\"","passed":true}]},{"name":"contains-success","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"success\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"message\": Output contains \"\"message\"\" | contains-success: Output contains \"success\""},{"text":"Output contains \"\"message\"\"","passed":true},{"text":"Output contains \"success\"","passed":true}]}
-{"timestamp":"2026-02-20T21:38:56.977Z","test_id":"multi-assertion","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"result\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"result\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"result\": Output contains \"\"result\"\""},{"text":"Output contains \"\"result\"\"","passed":true}]}
+{"timestamp":"2026-02-20T21:38:55.767Z","test_id":"regex-email","suite":"dataset","score":1,"target":"default","scores":[{"name":"regex-[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/","passed":true}]}],"assertions":[{"text":"Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/","passed":true,"evidence":"regex-[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-: Output matches pattern /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}/"}]}
+{"timestamp":"2026-02-20T21:38:55.771Z","test_id":"contains-basic","suite":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\""}]}
+{"timestamp":"2026-02-20T21:38:55.838Z","test_id":"equals-exact","suite":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]}
+{"timestamp":"2026-02-20T21:38:56.293Z","test_id":"starts-with-prefix","suite":"dataset","score":1,"target":"default","scores":[{"name":"regex-^Dear User","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /^Dear User/","passed":true}]}],"assertions":[{"text":"Output matches pattern /^Dear User/","passed":true,"evidence":"regex-^Dear User: Output matches pattern /^Dear User/"}]}
+{"timestamp":"2026-02-20T21:38:56.319Z","test_id":"is-json-valid","suite":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON"}]}
+{"timestamp":"2026-02-20T21:38:56.720Z","test_id":"required-gate","suite":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"message\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"message\"\"","passed":true}]},{"name":"contains-success","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"success\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"message\": Output contains \"\"message\"\" | contains-success: Output contains \"success\""},{"text":"Output contains \"\"message\"\"","passed":true},{"text":"Output contains \"success\"","passed":true}]}
+{"timestamp":"2026-02-20T21:38:56.977Z","test_id":"multi-assertion","suite":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"result\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"result\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"result\": Output contains \"\"result\"\""},{"text":"Output contains \"\"result\"\"","passed":true}]}
diff --git a/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl b/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl
index 94c355d16..e50861163 100644
--- a/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl
+++ b/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl
@@ -1,5 +1,5 @@
-{"timestamp":"2026-02-20T21:38:57.573Z","test_id":"metrics-001","dataset":"dataset-confusion-metrics","score":1,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":6,"total_tn":0,"total_fp":0,"total_fn":0,"macro_precision":1,"macro_recall":1,"macro_f1":1}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]}
-{"timestamp":"2026-02-20T21:38:57.582Z","test_id":"metrics-003","dataset":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0}},"summary":{"total_tp":4,"total_tn":0,"total_fp":2,"total_fn":2,"macro_precision":0.6666666666666666,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]}
-{"timestamp":"2026-02-20T21:38:57.588Z","test_id":"metrics-002","dataset":"dataset-confusion-metrics","score":0.8333333333333334,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.8333333333333334,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":5,"total_tn":0,"total_fp":1,"total_fn":1,"macro_precision":0.8333333333333334,"macro_recall":0.8333333333333334,"macro_f1":0.8333333333333334}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}
-{"timestamp":"2026-02-20T21:38:57.641Z","test_id":"metrics-004","dataset":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":4,"total_tn":0,"total_fp":1,"total_fn":2,"macro_precision":0.8,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}
-{"timestamp":"2026-02-20T21:38:57.649Z","test_id":"metrics-005","dataset":"dataset-confusion-metrics","score":0.5,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.5,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0}},"summary":{"total_tp":3,"total_tn":0,"total_fp":2,"total_fn":3,"macro_precision":0.6,"macro_recall":0.5,"macro_f1":0.5}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]}
+{"timestamp":"2026-02-20T21:38:57.573Z","test_id":"metrics-001","suite":"dataset-confusion-metrics","score":1,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":6,"total_tn":0,"total_fp":0,"total_fn":0,"macro_precision":1,"macro_recall":1,"macro_f1":1}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]}
+{"timestamp":"2026-02-20T21:38:57.582Z","test_id":"metrics-003","suite":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0}},"summary":{"total_tp":4,"total_tn":0,"total_fp":2,"total_fn":2,"macro_precision":0.6666666666666666,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]}
+{"timestamp":"2026-02-20T21:38:57.588Z","test_id":"metrics-002","suite":"dataset-confusion-metrics","score":0.8333333333333334,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.8333333333333334,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":5,"total_tn":0,"total_fp":1,"total_fn":1,"macro_precision":0.8333333333333334,"macro_recall":0.8333333333333334,"macro_f1":0.8333333333333334}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}
+{"timestamp":"2026-02-20T21:38:57.641Z","test_id":"metrics-004","suite":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":4,"total_tn":0,"total_fp":1,"total_fn":2,"macro_precision":0.8,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}
+{"timestamp":"2026-02-20T21:38:57.649Z","test_id":"metrics-005","suite":"dataset-confusion-metrics","score":0.5,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.5,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0}},"summary":{"total_tp":3,"total_tn":0,"total_fp":2,"total_fn":3,"macro_precision":0.6,"macro_recall":0.5,"macro_f1":0.5}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]}
diff --git a/examples/features/document-extraction/evals/field-accuracy.eval.baseline.jsonl b/examples/features/document-extraction/evals/field-accuracy.eval.baseline.jsonl
index 5fe28c677..b4adb4b9e 100644
--- a/examples/features/document-extraction/evals/field-accuracy.eval.baseline.jsonl
+++ b/examples/features/document-extraction/evals/field-accuracy.eval.baseline.jsonl
@@ -1,6 +1,6 @@
-{"timestamp":"2026-02-20T21:38:58.215Z","test_id":"invoice-001","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}],"assertions":[{"text":"invoice_number","passed":true,"evidence":"invoice_field_accuracy: 9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}
-{"timestamp":"2026-02-20T21:38:58.237Z","test_id":"invoice-003","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}],"assertions":[{"text":"invoice_number","passed":true,"evidence":"invoice_field_accuracy: 9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}
-{"timestamp":"2026-02-20T21:38:58.265Z","test_id":"invoice-002","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":0.9743589743589745,"target":"mock_extractor","scores":[{"name":"party_names_fuzzy","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"supplier.name: 86.7% >= 85% threshold","passed":true,"evidence":"supplier.name: \"Acme - Shipping\" vs \"Acme Shipping\" = 86.7%; importer.name: \"Global Trade Co\" vs \"Global Trade Co\" = 100.0%"},{"text":"importer.name: 100.0% >= 90% threshold","passed":true}]},{"name":"other_fields","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"4/4 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"net_total (within tolerance: diff=0.00)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":0.9230769230769231,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"8/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"supplier.name (value mismatch)","passed":false}]}],"assertions":[{"text":"supplier.name: 86.7% >= 85% threshold","passed":true,"evidence":"party_names_fuzzy: supplier.name: \"Acme - Shipping\" vs \"Acme Shipping\" = 86.7%; importer.name: \"Global Trade Co\" vs \"Global Trade Co\" = 100.0% | other_fields: 4/4 fields matched | invoice_field_accuracy: 8/9 fields matched"},{"text":"importer.name: 100.0% >= 90% threshold","passed":true},{"text":"invoice_number","passed":true},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"net_total (within tolerance: diff=0.00)","passed":true},{"text":"invoice_number","passed":true},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.address: no expected 
value","passed":true},{"text":"supplier.name (value mismatch)","passed":false}]}
-{"timestamp":"2026-02-20T21:38:58.272Z","test_id":"invoice-004","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":0.8461538461538461,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":0.8461538461538461,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_date","passed":true,"evidence":"8/9 fields matched"},{"text":"currency","passed":true},{"text":"supplier.name","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"invoice_number (required, missing)","passed":false}]}],"assertions":[{"text":"invoice_date","passed":true,"evidence":"invoice_field_accuracy: 8/9 fields matched"},{"text":"currency","passed":true},{"text":"supplier.name","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"invoice_number (required, missing)","passed":false}]}
-{"timestamp":"2026-02-20T21:38:58.276Z","test_id":"invoice-005","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"line_items_check","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"line_items[0].description","passed":true,"evidence":"4/4 fields matched"},{"text":"line_items[0].line_total (within tolerance: diff=0.00)","passed":true},{"text":"line_items[1].description","passed":true},{"text":"line_items[1].line_total (within tolerance: diff=0.00)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number: no expected value","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}],"assertions":[{"text":"line_items[0].description","passed":true,"evidence":"line_items_check: 4/4 fields matched | invoice_field_accuracy: 9/9 fields matched"},{"text":"line_items[0].line_total (within tolerance: diff=0.00)","passed":true},{"text":"line_items[1].description","passed":true},{"text":"line_items[1].line_total (within tolerance: diff=0.00)","passed":true},{"text":"invoice_number: no expected value","passed":true},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}
-{"timestamp":"2026-02-20T21:38:58.335Z","test_id":"invoice-006","dataset":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"line_items_matched","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"alignment":[{"expected_idx":0,"parsed_idx":1,"similarity":1},{"expected_idx":1,"parsed_idx":0,"similarity":1}],"metrics":{"description":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"quantity":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"line_total":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"unmatched_expected":[],"unmatched_parsed":[],"summary":{"matched_count":2,"expected_count":2,"parsed_count":2,"macro_f1":1}},"assertions":[{"text":"Matched expected[0] -> parsed[1] (100%)","passed":true,"evidence":"Matched 2/2 expected items, macro-F1=1.000"},{"text":"Matched expected[1] -> parsed[0] (100%)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number: no expected value","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}],"assertions":[{"text":"Matched expected[0] -> parsed[1] (100%)","passed":true,"evidence":"line_items_matched: Matched 2/2 expected items, macro-F1=1.000 | invoice_field_accuracy: 9/9 fields matched"},{"text":"Matched expected[1] -> parsed[0] (100%)","passed":true},{"text":"invoice_number: no expected value","passed":true},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}
+{"timestamp":"2026-02-20T21:38:58.215Z","test_id":"invoice-001","suite":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}],"assertions":[{"text":"invoice_number","passed":true,"evidence":"invoice_field_accuracy: 9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}
+{"timestamp":"2026-02-20T21:38:58.237Z","test_id":"invoice-003","suite":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}],"assertions":[{"text":"invoice_number","passed":true,"evidence":"invoice_field_accuracy: 9/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.name","passed":true}]}
+{"timestamp":"2026-02-20T21:38:58.265Z","test_id":"invoice-002","suite":"dataset-field-accuracy","conversation_id":"document-extraction","score":0.9743589743589745,"target":"mock_extractor","scores":[{"name":"party_names_fuzzy","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"supplier.name: 86.7% >= 85% threshold","passed":true,"evidence":"supplier.name: \"Acme - Shipping\" vs \"Acme Shipping\" = 86.7%; importer.name: \"Global Trade Co\" vs \"Global Trade Co\" = 100.0%"},{"text":"importer.name: 100.0% >= 90% threshold","passed":true}]},{"name":"other_fields","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"4/4 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"net_total (within tolerance: diff=0.00)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":0.9230769230769231,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number","passed":true,"evidence":"8/9 fields matched"},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"supplier.name (value mismatch)","passed":false}]}],"assertions":[{"text":"supplier.name: 86.7% >= 85% threshold","passed":true,"evidence":"party_names_fuzzy: supplier.name: \"Acme - Shipping\" vs \"Acme Shipping\" = 86.7%; importer.name: \"Global Trade Co\" vs \"Global Trade Co\" = 100.0% | other_fields: 4/4 fields matched | invoice_field_accuracy: 8/9 fields matched"},{"text":"importer.name: 100.0% >= 90% threshold","passed":true},{"text":"invoice_number","passed":true},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"net_total (within tolerance: diff=0.00)","passed":true},{"text":"invoice_number","passed":true},{"text":"invoice_date","passed":true},{"text":"currency","passed":true},{"text":"supplier.address: no expected 
value","passed":true},{"text":"supplier.name (value mismatch)","passed":false}]}
+{"timestamp":"2026-02-20T21:38:58.272Z","test_id":"invoice-004","suite":"dataset-field-accuracy","conversation_id":"document-extraction","score":0.8461538461538461,"target":"mock_extractor","scores":[{"name":"invoice_field_accuracy","type":"field-accuracy","score":0.8461538461538461,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_date","passed":true,"evidence":"8/9 fields matched"},{"text":"currency","passed":true},{"text":"supplier.name","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"invoice_number (required, missing)","passed":false}]}],"assertions":[{"text":"invoice_date","passed":true,"evidence":"invoice_field_accuracy: 8/9 fields matched"},{"text":"currency","passed":true},{"text":"supplier.name","passed":true},{"text":"supplier.address: no expected value","passed":true},{"text":"invoice_number (required, missing)","passed":false}]}
+{"timestamp":"2026-02-20T21:38:58.276Z","test_id":"invoice-005","suite":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"line_items_check","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"line_items[0].description","passed":true,"evidence":"4/4 fields matched"},{"text":"line_items[0].line_total (within tolerance: diff=0.00)","passed":true},{"text":"line_items[1].description","passed":true},{"text":"line_items[1].line_total (within tolerance: diff=0.00)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number: no expected value","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}],"assertions":[{"text":"line_items[0].description","passed":true,"evidence":"line_items_check: 4/4 fields matched | invoice_field_accuracy: 9/9 fields matched"},{"text":"line_items[0].line_total (within tolerance: diff=0.00)","passed":true},{"text":"line_items[1].description","passed":true},{"text":"line_items[1].line_total (within tolerance: diff=0.00)","passed":true},{"text":"invoice_number: no expected value","passed":true},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}
+{"timestamp":"2026-02-20T21:38:58.335Z","test_id":"invoice-006","suite":"dataset-field-accuracy","conversation_id":"document-extraction","score":1,"target":"mock_extractor","scores":[{"name":"line_items_matched","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"alignment":[{"expected_idx":0,"parsed_idx":1,"similarity":1},{"expected_idx":1,"parsed_idx":0,"similarity":1}],"metrics":{"description":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"quantity":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"line_total":{"tp":2,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"unmatched_expected":[],"unmatched_parsed":[],"summary":{"matched_count":2,"expected_count":2,"parsed_count":2,"macro_f1":1}},"assertions":[{"text":"Matched expected[0] -> parsed[1] (100%)","passed":true,"evidence":"Matched 2/2 expected items, macro-F1=1.000"},{"text":"Matched expected[1] -> parsed[0] (100%)","passed":true}]},{"name":"invoice_field_accuracy","type":"field-accuracy","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"invoice_number: no expected value","passed":true,"evidence":"9/9 fields matched"},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}],"assertions":[{"text":"Matched expected[0] -> parsed[1] (100%)","passed":true,"evidence":"line_items_matched: Matched 2/2 expected items, macro-F1=1.000 | invoice_field_accuracy: 9/9 fields matched"},{"text":"Matched expected[1] -> parsed[0] (100%)","passed":true},{"text":"invoice_number: no expected value","passed":true},{"text":"invoice_date: no expected value","passed":true},{"text":"currency: no expected value","passed":true},{"text":"supplier.name: no expected value","passed":true}]}
diff --git a/examples/features/execution-metrics/evals/dataset.eval.baseline.jsonl b/examples/features/execution-metrics/evals/dataset.eval.baseline.jsonl
index 204c88e32..7c7fb0721 100644
--- a/examples/features/execution-metrics/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/execution-metrics/evals/dataset.eval.baseline.jsonl
@@ -1,6 +1,6 @@
-{"timestamp":"2026-02-20T21:38:58.923Z","test_id":"simple-thresholds-pass","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"efficiency-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"execution_metrics tool_calls=0, tokens=27, duration=245ms"},{"text":"Total tokens 27 <= 2000 max","passed":true},{"text":"Duration 245ms <= 10000ms max","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"efficiency-check: execution_metrics tool_calls=0, tokens=27, duration=245ms"},{"text":"Total tokens 27 <= 2000 max","passed":true},{"text":"Duration 245ms <= 10000ms max","passed":true}]}
-{"timestamp":"2026-02-20T21:38:58.931Z","test_id":"research-with-metrics","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"search: called 1 times (required \u22651)","passed":true}]},{"name":"metrics-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 2 <= 20 max","passed":true,"evidence":"execution_metrics tool_calls=2, tokens=830"},{"text":"Total tokens 830 <= 5000 max","passed":true}]}],"assertions":[{"text":"search: called 1 times (required \u22651)","passed":true,"evidence":"metrics-check: execution_metrics tool_calls=2, tokens=830"},{"text":"Tool calls 2 <= 20 max","passed":true},{"text":"Total tokens 830 <= 5000 max","passed":true}]}
-{"timestamp":"2026-02-20T21:38:58.939Z","test_id":"comprehensive-thresholds","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"full-efficiency-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 15 max","passed":true,"evidence":"execution_metrics tool_calls=0, llm_calls=1, tokens=27, cost=$0.0001, duration=245ms"},{"text":"LLM calls 1 <= 5 max","passed":true},{"text":"Total tokens 27 <= 3000 max","passed":true},{"text":"Cost $0.0001 <= $0.1000 max","passed":true},{"text":"Duration 245ms <= 30000ms max","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 15 max","passed":true,"evidence":"full-efficiency-check: execution_metrics tool_calls=0, llm_calls=1, tokens=27, cost=$0.0001, duration=245ms"},{"text":"LLM calls 1 <= 5 max","passed":true},{"text":"Total tokens 27 <= 3000 max","passed":true},{"text":"Cost $0.0001 <= $0.1000 max","passed":true},{"text":"Duration 245ms <= 30000ms max","passed":true}]}
-{"timestamp":"2026-02-20T21:38:58.977Z","test_id":"exploration-ratio-check","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"exploration-balance","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Exploration ratio 0.50 within tolerance of target 0.5","passed":true,"evidence":"execution_metrics exploration_ratio=0.50"}]}],"assertions":[{"text":"Exploration ratio 0.50 within tolerance of target 0.5","passed":true,"evidence":"exploration-balance: execution_metrics exploration_ratio=0.50"}]}
-{"timestamp":"2026-02-20T21:38:58.980Z","test_id":"cost-budget-check","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"cost-check","type":"execution-metrics","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Cost $0.0008 <= $0.0500 max","passed":true,"evidence":"execution_metrics cost=$0.0008"}]}],"assertions":[{"text":"Cost $0.0008 <= $0.0500 max","passed":true,"evidence":"cost-check: execution_metrics cost=$0.0008"}]}
-{"timestamp":"2026-02-20T21:38:59.046Z","test_id":"hybrid-evaluation","dataset":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"metric-thresholds","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"execution_metrics tool_calls=0, duration=890ms"},{"text":"Duration 890ms <= 5000ms max","passed":true}]},{"name":"custom-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"tokenUsage present: 85/42","passed":true,"evidence":"Checked 3 metric fields: 3 present, 0 missing"},{"text":"costUsd present: $0.0008","passed":true},{"text":"durationMs present: 890ms","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"metric-thresholds: execution_metrics tool_calls=0, duration=890ms | custom-check: Checked 3 metric fields: 3 present, 0 missing"},{"text":"Duration 890ms <= 5000ms max","passed":true},{"text":"tokenUsage present: 85/42","passed":true},{"text":"costUsd present: $0.0008","passed":true},{"text":"durationMs present: 890ms","passed":true}]}
+{"timestamp":"2026-02-20T21:38:58.923Z","test_id":"simple-thresholds-pass","suite":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"efficiency-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"execution_metrics tool_calls=0, tokens=27, duration=245ms"},{"text":"Total tokens 27 <= 2000 max","passed":true},{"text":"Duration 245ms <= 10000ms max","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"efficiency-check: execution_metrics tool_calls=0, tokens=27, duration=245ms"},{"text":"Total tokens 27 <= 2000 max","passed":true},{"text":"Duration 245ms <= 10000ms max","passed":true}]}
+{"timestamp":"2026-02-20T21:38:58.931Z","test_id":"research-with-metrics","suite":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"search: called 1 times (required \u22651)","passed":true}]},{"name":"metrics-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 2 <= 20 max","passed":true,"evidence":"execution_metrics tool_calls=2, tokens=830"},{"text":"Total tokens 830 <= 5000 max","passed":true}]}],"assertions":[{"text":"search: called 1 times (required \u22651)","passed":true,"evidence":"metrics-check: execution_metrics tool_calls=2, tokens=830"},{"text":"Tool calls 2 <= 20 max","passed":true},{"text":"Total tokens 830 <= 5000 max","passed":true}]}
+{"timestamp":"2026-02-20T21:38:58.939Z","test_id":"comprehensive-thresholds","suite":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"full-efficiency-check","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 15 max","passed":true,"evidence":"execution_metrics tool_calls=0, llm_calls=1, tokens=27, cost=$0.0001, duration=245ms"},{"text":"LLM calls 1 <= 5 max","passed":true},{"text":"Total tokens 27 <= 3000 max","passed":true},{"text":"Cost $0.0001 <= $0.1000 max","passed":true},{"text":"Duration 245ms <= 30000ms max","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 15 max","passed":true,"evidence":"full-efficiency-check: execution_metrics tool_calls=0, llm_calls=1, tokens=27, cost=$0.0001, duration=245ms"},{"text":"LLM calls 1 <= 5 max","passed":true},{"text":"Total tokens 27 <= 3000 max","passed":true},{"text":"Cost $0.0001 <= $0.1000 max","passed":true},{"text":"Duration 245ms <= 30000ms max","passed":true}]}
+{"timestamp":"2026-02-20T21:38:58.977Z","test_id":"exploration-ratio-check","suite":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"exploration-balance","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Exploration ratio 0.50 within tolerance of target 0.5","passed":true,"evidence":"execution_metrics exploration_ratio=0.50"}]}],"assertions":[{"text":"Exploration ratio 0.50 within tolerance of target 0.5","passed":true,"evidence":"exploration-balance: execution_metrics exploration_ratio=0.50"}]}
+{"timestamp":"2026-02-20T21:38:58.980Z","test_id":"cost-budget-check","suite":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"cost-check","type":"execution-metrics","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Cost $0.0008 <= $0.0500 max","passed":true,"evidence":"execution_metrics cost=$0.0008"}]}],"assertions":[{"text":"Cost $0.0008 <= $0.0500 max","passed":true,"evidence":"cost-check: execution_metrics cost=$0.0008"}]}
+{"timestamp":"2026-02-20T21:38:59.046Z","test_id":"hybrid-evaluation","suite":"dataset","score":1,"target":"mock_metrics_agent","scores":[{"name":"metric-thresholds","type":"execution-metrics","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"execution_metrics tool_calls=0, duration=890ms"},{"text":"Duration 890ms <= 5000ms max","passed":true}]},{"name":"custom-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"tokenUsage present: 85/42","passed":true,"evidence":"Checked 3 metric fields: 3 present, 0 missing"},{"text":"costUsd present: $0.0008","passed":true},{"text":"durationMs present: 890ms","passed":true}]}],"assertions":[{"text":"Tool calls 0 <= 10 max","passed":true,"evidence":"metric-thresholds: execution_metrics tool_calls=0, duration=890ms | custom-check: Checked 3 metric fields: 3 present, 0 missing"},{"text":"Duration 890ms <= 5000ms max","passed":true},{"text":"tokenUsage present: 85/42","passed":true},{"text":"costUsd present: $0.0008","passed":true},{"text":"durationMs present: 890ms","passed":true}]}
diff --git a/examples/features/external-datasets/evals/dataset.eval.baseline.jsonl b/examples/features/external-datasets/evals/dataset.eval.baseline.jsonl
index 45cf29dff..956bdb427 100644
--- a/examples/features/external-datasets/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/external-datasets/evals/dataset.eval.baseline.jsonl
@@ -1,5 +1,5 @@
-{"timestamp":"2026-02-20T21:39:01.382Z","test_id":"accuracy-capital","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Correctly identified Paris as the capital","passed":true,"evidence":"The answer accurately and concisely provides the correct capital city, fully meeting the task criteria."},{"text":"Direct and unambiguous response","passed":true}]}
-{"timestamp":"2026-02-20T21:39:01.392Z","test_id":"accuracy-basic-math","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Correctly adds 15 and 27","passed":true,"evidence":"The answer is mathematically correct, clearly formatted, and directly addresses the question."},{"text":"Gives the correct sum (42)","passed":true},{"text":"Clearly formats the answer with an equation","passed":true},{"text":"Directly answers the math question","passed":true}]}
-{"timestamp":"2026-02-20T21:39:01.805Z","test_id":"inline-test","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Polite greeting","passed":true,"evidence":"The candidate provides a polite greeting, uses a friendly tone, and offers help, fully meeting the criteria for a courteous response."},{"text":"Friendly tone","passed":true},{"text":"Offers assistance","passed":true}]}
-{"timestamp":"2026-02-20T21:39:03.498Z","test_id":"regression-farewell","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Farewell expressed","passed":true,"evidence":"The candidate responds with a clear farewell, adds a friendly message, and offers further help, fully satisfying the criteria."},{"text":"Friendly closing","passed":true},{"text":"Offer of future assistance","passed":true},{"text":"Well-wishing included","passed":true}]}
-{"timestamp":"2026-02-20T21:39:03.681Z","test_id":"regression-greeting","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Provides a friendly greeting","passed":true,"evidence":"The candidate answer offers a polite greeting as requested and invites further interaction, fully meeting the task criteria."},{"text":"Engages user proactively","passed":true}]}
+{"timestamp":"2026-02-20T21:39:01.382Z","test_id":"accuracy-capital","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Correctly identified Paris as the capital","passed":true,"evidence":"The answer accurately and concisely provides the correct capital city, fully meeting the task criteria."},{"text":"Direct and unambiguous response","passed":true}]}
+{"timestamp":"2026-02-20T21:39:01.392Z","test_id":"accuracy-basic-math","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Correctly adds 15 and 27","passed":true,"evidence":"The answer is mathematically correct, clearly formatted, and directly addresses the question."},{"text":"Gives the correct sum (42)","passed":true},{"text":"Clearly formats the answer with an equation","passed":true},{"text":"Directly answers the math question","passed":true}]}
+{"timestamp":"2026-02-20T21:39:01.805Z","test_id":"inline-test","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Polite greeting","passed":true,"evidence":"The candidate provides a polite greeting, uses a friendly tone, and offers help, fully meeting the criteria for a courteous response."},{"text":"Friendly tone","passed":true},{"text":"Offers assistance","passed":true}]}
+{"timestamp":"2026-02-20T21:39:03.498Z","test_id":"regression-farewell","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Farewell expressed","passed":true,"evidence":"The candidate responds with a clear farewell, adds a friendly message, and offers further help, fully satisfying the criteria."},{"text":"Friendly closing","passed":true},{"text":"Offer of future assistance","passed":true},{"text":"Well-wishing included","passed":true}]}
+{"timestamp":"2026-02-20T21:39:03.681Z","test_id":"regression-greeting","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Provides a friendly greeting","passed":true,"evidence":"The candidate answer offers a polite greeting as requested and invites further interaction, fully meeting the task criteria."},{"text":"Engages user proactively","passed":true}]}
diff --git a/examples/features/file-changes/evals/dataset.eval.baseline.jsonl b/examples/features/file-changes/evals/dataset.eval.baseline.jsonl
index d35be43bc..852c88c5a 100644
--- a/examples/features/file-changes/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/file-changes/evals/dataset.eval.baseline.jsonl
@@ -1,2 +1,2 @@
-{"timestamp":"2026-02-20T21:39:04.356Z","test_id":"verify-deletes-and-structure","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"check-deletes-and-structure","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"6/6 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true},{"text":"delete detected: obsolete.log","passed":true}]}],"assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"check-deletes-and-structure: 6/6 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true},{"text":"delete detected: obsolete.log","passed":true}]}
-{"timestamp":"2026-02-20T21:39:04.366Z","test_id":"verify-edits-and-creates","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"check-edits-and-creates","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"5/5 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true}]}],"assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"check-edits-and-creates: 5/5 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true}]}
+{"timestamp":"2026-02-20T21:39:04.356Z","test_id":"verify-deletes-and-structure","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"check-deletes-and-structure","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"6/6 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true},{"text":"delete detected: obsolete.log","passed":true}]}],"assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"check-deletes-and-structure: 6/6 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true},{"text":"delete detected: obsolete.log","passed":true}]}
+{"timestamp":"2026-02-20T21:39:04.366Z","test_id":"verify-edits-and-creates","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"check-edits-and-creates","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"5/5 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true}]}],"assertions":[{"text":"file_changes contains unified diff format","passed":true,"evidence":"check-edits-and-creates: 5/5 checks passed"},{"text":"edit detected: hello.txt","passed":true},{"text":"edit detected: config.json","passed":true},{"text":"create detected: src/utils.ts","passed":true},{"text":"create detected: tests/main.test.ts","passed":true}]}
diff --git a/examples/features/functional-grading/evals/dataset.eval.baseline.jsonl b/examples/features/functional-grading/evals/dataset.eval.baseline.jsonl
index ca86726aa..eb81d64db 100644
--- a/examples/features/functional-grading/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/functional-grading/evals/dataset.eval.baseline.jsonl
@@ -1 +1 @@
-{"timestamp":"2026-02-20T21:39:08.885Z","test_id":"implement-math-functions","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"functional-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"npm install passed","passed":true,"evidence":"Passed 4/4 stages"},{"text":"typecheck passed","passed":true},{"text":"compile passed","passed":true},{"text":"tests passed","passed":true}]}],"assertions":[{"text":"npm install passed","passed":true,"evidence":"functional-check: Passed 4/4 stages"},{"text":"typecheck passed","passed":true},{"text":"compile passed","passed":true},{"text":"tests passed","passed":true}]}
+{"timestamp":"2026-02-20T21:39:08.885Z","test_id":"implement-math-functions","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"functional-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"npm install passed","passed":true,"evidence":"Passed 4/4 stages"},{"text":"typecheck passed","passed":true},{"text":"compile passed","passed":true},{"text":"tests passed","passed":true}]}],"assertions":[{"text":"npm install passed","passed":true,"evidence":"functional-check: Passed 4/4 stages"},{"text":"typecheck passed","passed":true},{"text":"compile passed","passed":true},{"text":"tests passed","passed":true}]}
diff --git a/examples/features/latency-assertions/evals/dataset.eval.baseline.jsonl b/examples/features/latency-assertions/evals/dataset.eval.baseline.jsonl
index f68348f70..88b4bafe9 100644
--- a/examples/features/latency-assertions/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/latency-assertions/evals/dataset.eval.baseline.jsonl
@@ -1,5 +1,5 @@
-{"timestamp":"2026-02-20T21:39:09.468Z","test_id":"mixed-latency","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"data-pipeline-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}
-{"timestamp":"2026-02-20T21:39:09.476Z","test_id":"latency-pass","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"fast-read","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found Read at position 0","passed":true}]}],"assertions":[{"text":"Found Read at position 0","passed":true}]}
-{"timestamp":"2026-02-20T21:39:09.505Z","test_id":"latency-fail","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"slow-read","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found Read at position 0","passed":true}]}],"assertions":[{"text":"Found Read at position 0","passed":true}]}
-{"timestamp":"2026-02-20T21:39:09.541Z","test_id":"exact-with-latency","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"auth-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}
-{"timestamp":"2026-02-20T21:39:09.552Z","test_id":"latency-with-args","dataset":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"weather-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}
+{"timestamp":"2026-02-20T21:39:09.468Z","test_id":"mixed-latency","suite":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"data-pipeline-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}
+{"timestamp":"2026-02-20T21:39:09.476Z","test_id":"latency-pass","suite":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"fast-read","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found Read at position 0","passed":true}]}],"assertions":[{"text":"Found Read at position 0","passed":true}]}
+{"timestamp":"2026-02-20T21:39:09.505Z","test_id":"latency-fail","suite":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"slow-read","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found Read at position 0","passed":true}]}],"assertions":[{"text":"Found Read at position 0","passed":true}]}
+{"timestamp":"2026-02-20T21:39:09.541Z","test_id":"exact-with-latency","suite":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"auth-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}
+{"timestamp":"2026-02-20T21:39:09.552Z","test_id":"latency-with-args","suite":"dataset","score":1,"target":"mock_latency_agent","scores":[{"name":"weather-perf","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}
diff --git a/examples/features/local-cli/evals/dataset.eval.baseline.jsonl b/examples/features/local-cli/evals/dataset.eval.baseline.jsonl
index 0254860af..751e74d17 100644
--- a/examples/features/local-cli/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/local-cli/evals/dataset.eval.baseline.jsonl
@@ -1 +1 @@
-{"timestamp":"2026-02-20T21:39:11.996Z","test_id":"cli-provider-echo","dataset":"dataset","score":1,"target":"local_cli","assertions":[{"text":"Echoes the request as in the reference","passed":true,"evidence":"The candidate answer accurately echoes the request, includes the correct number of attachments, and lists both files by name, fully meeting all stated criteria."},{"text":"Mentions number of attachments","passed":true},{"text":"Lists all attachment names present","passed":true},{"text":"Matches the spirit of the prompt","passed":true}]}
+{"timestamp":"2026-02-20T21:39:11.996Z","test_id":"cli-provider-echo","suite":"dataset","score":1,"target":"local_cli","assertions":[{"text":"Echoes the request as in the reference","passed":true,"evidence":"The candidate answer accurately echoes the request, includes the correct number of attachments, and lists both files by name, fully meeting all stated criteria."},{"text":"Mentions number of attachments","passed":true},{"text":"Lists all attachment names present","passed":true},{"text":"Matches the spirit of the prompt","passed":true}]}
diff --git a/examples/features/matrix-evaluation/evals/dataset.eval.baseline.jsonl b/examples/features/matrix-evaluation/evals/dataset.eval.baseline.jsonl
index e9a385092..b564d5b88 100644
--- a/examples/features/matrix-evaluation/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/matrix-evaluation/evals/dataset.eval.baseline.jsonl
@@ -1,5 +1,5 @@
-{"timestamp":"2026-02-20T21:39:24.152Z","test_id":"general-greeting","dataset":"dataset","score":1,"target":"copilot","assertions":[{"text":"Contains a greeting","passed":true,"evidence":"The candidate answer provides a direct and polite greeting, fully satisfying the task requirements."},{"text":"Polite tone","passed":true}]}
-{"timestamp":"2026-02-20T21:39:32.353Z","test_id":"code-generation","dataset":"dataset","score":1,"target":"copilot","assertions":[{"text":"Valid Python function provided","passed":true,"evidence":"The candidate provided a valid, efficient Python function with error handling and correct Fibonacci indexing as requested."},{"text":"Iterative O(n) implementation","passed":true},{"text":"Handles input validation and errors","passed":true},{"text":"Returns correct Fibonacci numbers starting with F0=0, F1=1","passed":true}]}
-{"timestamp":"2026-02-20T21:40:00.939Z","test_id":"copilot-only-task","dataset":"dataset","score":1,"target":"copilot","assertions":[{"text":"References GitHub directly by providing a repository link","passed":true,"evidence":"The answer correctly references GitHub, provides a direct link to the issue, and offers options for further customization. It fully satisfies the criteria."},{"text":"States that the GitHub issue has been created","passed":true},{"text":"Offers to update the issue's content per user input","passed":true},{"text":"Includes actionable next steps for customization","passed":true}]}
-{"timestamp":"2026-02-20T21:40:01.635Z","test_id":"code-generation","dataset":"dataset","score":0,"target":"claude","error":"Claude Code process exited with code 1","assertions":[{"text":"Error: Claude Code process exited with code 1","passed":false}]}
-{"timestamp":"2026-02-20T21:40:01.674Z","test_id":"general-greeting","dataset":"dataset","score":0,"target":"claude","error":"Claude Code process exited with code 1","assertions":[{"text":"Error: Claude Code process exited with code 1","passed":false}]}
+{"timestamp":"2026-02-20T21:39:24.152Z","test_id":"general-greeting","suite":"dataset","score":1,"target":"copilot","assertions":[{"text":"Contains a greeting","passed":true,"evidence":"The candidate answer provides a direct and polite greeting, fully satisfying the task requirements."},{"text":"Polite tone","passed":true}]}
+{"timestamp":"2026-02-20T21:39:32.353Z","test_id":"code-generation","suite":"dataset","score":1,"target":"copilot","assertions":[{"text":"Valid Python function provided","passed":true,"evidence":"The candidate provided a valid, efficient Python function with error handling and correct Fibonacci indexing as requested."},{"text":"Iterative O(n) implementation","passed":true},{"text":"Handles input validation and errors","passed":true},{"text":"Returns correct Fibonacci numbers starting with F0=0, F1=1","passed":true}]}
+{"timestamp":"2026-02-20T21:40:00.939Z","test_id":"copilot-only-task","suite":"dataset","score":1,"target":"copilot","assertions":[{"text":"References GitHub directly by providing a repository link","passed":true,"evidence":"The answer correctly references GitHub, provides a direct link to the issue, and offers options for further customization. It fully satisfies the criteria."},{"text":"States that the GitHub issue has been created","passed":true},{"text":"Offers to update the issue's content per user input","passed":true},{"text":"Includes actionable next steps for customization","passed":true}]}
+{"timestamp":"2026-02-20T21:40:01.635Z","test_id":"code-generation","suite":"dataset","score":0,"target":"claude","error":"Claude Code process exited with code 1","assertions":[{"text":"Error: Claude Code process exited with code 1","passed":false}]}
+{"timestamp":"2026-02-20T21:40:01.674Z","test_id":"general-greeting","suite":"dataset","score":0,"target":"claude","error":"Claude Code process exited with code 1","assertions":[{"text":"Error: Claude Code process exited with code 1","passed":false}]}
diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl
index 59419bbcb..ea89ea558 100644
--- a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl
@@ -1,2 +1,2 @@
-{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","dataset":"dataset","score":0.8,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"The assistant addresses Sarah by name and references order #98765 while handling the shipping issue."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Each assistant turn directly addresses the user's immediate request","passed":true,"evidence":"The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a professional and empathetic support persona","passed":true,"evidence":"The tone remains consistent with the system instructions across all turns."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}],"assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"context_retention: The assistant addresses Sarah by name and references order #98765 while handling the shipping issue. | conversation_relevancy: The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents. 
| role_adherence: The tone remains consistent with the system instructions across all turns."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true},{"text":"Each assistant turn directly addresses the user's immediate request","passed":true},{"text":"Assistant maintains a professional and empathetic support persona","passed":true},{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}
-{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","dataset":"dataset","score":1,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom."}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true,"evidence":"The responses remain focused on diagnosing the WiFi issue and next-step guidance."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true,"evidence":"The tone stays patient and the instructions remain simple across the conversation."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}}],"assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"context_retention: The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom. | conversation_relevancy: The responses remain focused on diagnosing the WiFi issue and next-step guidance. | role_adherence: The tone stays patient and the instructions remain simple across the conversation."},{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true},{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true}]}
+{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","suite":"dataset","score":0.8,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"The assistant addresses Sarah by name and references order #98765 while handling the shipping issue."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Each assistant turn directly addresses the user's immediate request","passed":true,"evidence":"The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a professional and empathetic support persona","passed":true,"evidence":"The tone remains consistent with the system instructions across all turns."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}],"assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"context_retention: The assistant addresses Sarah by name and references order #98765 while handling the shipping issue. | conversation_relevancy: The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents. 
| role_adherence: The tone remains consistent with the system instructions across all turns."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true},{"text":"Each assistant turn directly addresses the user's immediate request","passed":true},{"text":"Assistant maintains a professional and empathetic support persona","passed":true},{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}
+{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","suite":"dataset","score":1,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom."}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true,"evidence":"The responses remain focused on diagnosing the WiFi issue and next-step guidance."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true,"evidence":"The tone stays patient and the instructions remain simple across the conversation."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}}],"assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"context_retention: The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom. | conversation_relevancy: The responses remain focused on diagnosing the WiFi issue and next-step guidance. | role_adherence: The tone stays patient and the instructions remain simple across the conversation."},{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true},{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true}]}
diff --git a/examples/features/nlp-metrics/evals/dataset.eval.baseline.jsonl b/examples/features/nlp-metrics/evals/dataset.eval.baseline.jsonl
index 93bfafe79..1b954cf6f 100644
--- a/examples/features/nlp-metrics/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/nlp-metrics/evals/dataset.eval.baseline.jsonl
@@ -1,5 +1,5 @@
-{"timestamp":"2026-02-20T21:40:36.077Z","test_id":"summarisation-rouge","dataset":"dataset","score":0.08695652173913043,"target":"default","scores":[{"name":"rouge-score","type":"code-grader","score":0.08695652173913043,"weight":1,"verdict":"fail","details":{"rouge1":{"precision":0.1,"recall":0.07692307692307693,"f1":0.08695652173913043},"rouge2":{"precision":0,"recall":0,"f1":0}},"assertions":[{"text":"ROUGE-1 F1 0.087 < 0.5","passed":false,"evidence":"ROUGE-1 F1=0.087, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]}],"assertions":[{"text":"ROUGE-1 F1 0.087 < 0.5","passed":false,"evidence":"rouge-score: ROUGE-1 F1=0.087, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]}
-{"timestamp":"2026-02-20T21:40:37.834Z","test_id":"translation-bleu","dataset":"dataset","score":0,"target":"default","scores":[{"name":"bleu-score","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"bleu":0},"assertions":[{"text":"BLEU 0.000 < 0.3","passed":false,"evidence":"BLEU score: 0.000"}]}],"assertions":[{"text":"BLEU 0.000 < 0.3","passed":false,"evidence":"bleu-score: BLEU score: 0.000"}]}
-{"timestamp":"2026-02-20T21:40:38.149Z","test_id":"paraphrase-similarity","dataset":"dataset","score":0.09128709291752768,"target":"default","scores":[{"name":"cosine-similarity","type":"code-grader","score":0.09128709291752768,"weight":1,"verdict":"fail","details":{"cosine":0.09128709291752768,"jaccard":0.047619047619047616},"assertions":[{"text":"Cosine similarity 0.091 < 0.7","passed":false,"evidence":"Cosine=0.091, Jaccard=0.048"},{"text":"Jaccard similarity 0.048 < 0.5","passed":false}]}],"assertions":[{"text":"Cosine similarity 0.091 < 0.7","passed":false,"evidence":"cosine-similarity: Cosine=0.091, Jaccard=0.048"},{"text":"Jaccard similarity 0.048 < 0.5","passed":false}]}
-{"timestamp":"2026-02-20T21:40:41.852Z","test_id":"multi-metric-evaluation","dataset":"dataset","score":0.07669616519174043,"target":"default","scores":[{"name":"rouge-score","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"rouge1":{"precision":0,"recall":0,"f1":0},"rouge2":{"precision":0,"recall":0,"f1":0}},"assertions":[{"text":"ROUGE-1 F1 0.000 < 0.5","passed":false,"evidence":"ROUGE-1 F1=0.000, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]},{"name":"cosine-similarity","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"cosine":0,"jaccard":0},"assertions":[{"text":"Cosine similarity 0.000 < 0.7","passed":false,"evidence":"Cosine=0.000, Jaccard=0.000"},{"text":"Jaccard similarity 0.000 < 0.5","passed":false}]},{"name":"edit-distance","type":"code-grader","score":0.23008849557522126,"weight":1,"verdict":"fail","details":{"distance":87,"max_len":113,"similarity":0.23008849557522126},"assertions":[{"text":"Edit similarity 0.230 < 0.8","passed":false,"evidence":"Levenshtein distance=87, normalised similarity=0.230"}]}],"assertions":[{"text":"ROUGE-1 F1 0.000 < 0.5","passed":false,"evidence":"rouge-score: ROUGE-1 F1=0.000, ROUGE-2 F1=0.000 | cosine-similarity: Cosine=0.000, Jaccard=0.000 | edit-distance: Levenshtein distance=87, normalised similarity=0.230"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false},{"text":"Cosine similarity 0.000 < 0.7","passed":false},{"text":"Jaccard similarity 0.000 < 0.5","passed":false},{"text":"Edit similarity 0.230 < 0.8","passed":false}]}
-{"timestamp":"2026-02-20T21:43:06.938Z","test_id":"extraction-levenshtein","dataset":"dataset","score":0.15384615384615385,"target":"default","scores":[{"name":"edit-distance","type":"code-grader","score":0.15384615384615385,"weight":1,"verdict":"fail","details":{"distance":55,"max_len":65,"similarity":0.15384615384615385},"assertions":[{"text":"Edit similarity 0.154 < 0.8","passed":false,"evidence":"Levenshtein distance=55, normalised similarity=0.154"}]}],"assertions":[{"text":"Edit similarity 0.154 < 0.8","passed":false,"evidence":"edit-distance: Levenshtein distance=55, normalised similarity=0.154"}]}
+{"timestamp":"2026-02-20T21:40:36.077Z","test_id":"summarisation-rouge","suite":"dataset","score":0.08695652173913043,"target":"default","scores":[{"name":"rouge-score","type":"code-grader","score":0.08695652173913043,"weight":1,"verdict":"fail","details":{"rouge1":{"precision":0.1,"recall":0.07692307692307693,"f1":0.08695652173913043},"rouge2":{"precision":0,"recall":0,"f1":0}},"assertions":[{"text":"ROUGE-1 F1 0.087 < 0.5","passed":false,"evidence":"ROUGE-1 F1=0.087, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]}],"assertions":[{"text":"ROUGE-1 F1 0.087 < 0.5","passed":false,"evidence":"rouge-score: ROUGE-1 F1=0.087, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]}
+{"timestamp":"2026-02-20T21:40:37.834Z","test_id":"translation-bleu","suite":"dataset","score":0,"target":"default","scores":[{"name":"bleu-score","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"bleu":0},"assertions":[{"text":"BLEU 0.000 < 0.3","passed":false,"evidence":"BLEU score: 0.000"}]}],"assertions":[{"text":"BLEU 0.000 < 0.3","passed":false,"evidence":"bleu-score: BLEU score: 0.000"}]}
+{"timestamp":"2026-02-20T21:40:38.149Z","test_id":"paraphrase-similarity","suite":"dataset","score":0.09128709291752768,"target":"default","scores":[{"name":"cosine-similarity","type":"code-grader","score":0.09128709291752768,"weight":1,"verdict":"fail","details":{"cosine":0.09128709291752768,"jaccard":0.047619047619047616},"assertions":[{"text":"Cosine similarity 0.091 < 0.7","passed":false,"evidence":"Cosine=0.091, Jaccard=0.048"},{"text":"Jaccard similarity 0.048 < 0.5","passed":false}]}],"assertions":[{"text":"Cosine similarity 0.091 < 0.7","passed":false,"evidence":"cosine-similarity: Cosine=0.091, Jaccard=0.048"},{"text":"Jaccard similarity 0.048 < 0.5","passed":false}]}
+{"timestamp":"2026-02-20T21:40:41.852Z","test_id":"multi-metric-evaluation","suite":"dataset","score":0.07669616519174043,"target":"default","scores":[{"name":"rouge-score","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"rouge1":{"precision":0,"recall":0,"f1":0},"rouge2":{"precision":0,"recall":0,"f1":0}},"assertions":[{"text":"ROUGE-1 F1 0.000 < 0.5","passed":false,"evidence":"ROUGE-1 F1=0.000, ROUGE-2 F1=0.000"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false}]},{"name":"cosine-similarity","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"cosine":0,"jaccard":0},"assertions":[{"text":"Cosine similarity 0.000 < 0.7","passed":false,"evidence":"Cosine=0.000, Jaccard=0.000"},{"text":"Jaccard similarity 0.000 < 0.5","passed":false}]},{"name":"edit-distance","type":"code-grader","score":0.23008849557522126,"weight":1,"verdict":"fail","details":{"distance":87,"max_len":113,"similarity":0.23008849557522126},"assertions":[{"text":"Edit similarity 0.230 < 0.8","passed":false,"evidence":"Levenshtein distance=87, normalised similarity=0.230"}]}],"assertions":[{"text":"ROUGE-1 F1 0.000 < 0.5","passed":false,"evidence":"rouge-score: ROUGE-1 F1=0.000, ROUGE-2 F1=0.000 | cosine-similarity: Cosine=0.000, Jaccard=0.000 | edit-distance: Levenshtein distance=87, normalised similarity=0.230"},{"text":"ROUGE-2 F1 0.000 < 0.3","passed":false},{"text":"Cosine similarity 0.000 < 0.7","passed":false},{"text":"Jaccard similarity 0.000 < 0.5","passed":false},{"text":"Edit similarity 0.230 < 0.8","passed":false}]}
+{"timestamp":"2026-02-20T21:43:06.938Z","test_id":"extraction-levenshtein","suite":"dataset","score":0.15384615384615385,"target":"default","scores":[{"name":"edit-distance","type":"code-grader","score":0.15384615384615385,"weight":1,"verdict":"fail","details":{"distance":55,"max_len":65,"similarity":0.15384615384615385},"assertions":[{"text":"Edit similarity 0.154 < 0.8","passed":false,"evidence":"Levenshtein distance=55, normalised similarity=0.154"}]}],"assertions":[{"text":"Edit similarity 0.154 < 0.8","passed":false,"evidence":"edit-distance: Levenshtein distance=55, normalised similarity=0.154"}]}
diff --git a/examples/features/prompt-template-sdk/evals/dataset.eval.baseline.jsonl b/examples/features/prompt-template-sdk/evals/dataset.eval.baseline.jsonl
index f551c9e45..53ca4ad5c 100644
--- a/examples/features/prompt-template-sdk/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/prompt-template-sdk/evals/dataset.eval.baseline.jsonl
@@ -1,2 +1,2 @@
-{"timestamp":"2026-02-20T21:40:06.863Z","test_id":"prompt-template-basic","dataset":"dataset","score":1,"target":"default","scores":[{"name":"custom-prompt-eval","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explains static typing and early error detection","passed":true,"evidence":"The answer is correct, complete, and clearly organized, covering all major benefits of TypeScript over JavaScript with accurate and specific points. There are no significant omissions."},{"text":"Mentions improved tooling and autocompletion","passed":true},{"text":"Addresses support for modern JS features and compatibility","passed":true},{"text":"Highlights benefits for large-scale application development","passed":true}]}],"assertions":[{"text":"Explains static typing and early error detection","passed":true,"evidence":"custom-prompt-eval: The answer is correct, complete, and clearly organized, covering all major benefits of TypeScript over JavaScript with accurate and specific points. There are no significant omissions."},{"text":"Mentions improved tooling and autocompletion","passed":true},{"text":"Addresses support for modern JS features and compatibility","passed":true},{"text":"Highlights benefits for large-scale application development","passed":true}]}
-{"timestamp":"2026-02-20T21:40:07.754Z","test_id":"prompt-template-with-config","dataset":"dataset","score":1,"target":"default","scores":[{"name":"strict-eval","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Mentions Promises as the underlying mechanism","passed":true,"evidence":"The answer accurately, clearly, and thoroughly covers async/await in JavaScript, referencing Promises, synchronous-looking syntax, and including illustrative code examples and error handling. No errors or omissions found."},{"text":"Explains that async/await makes asynchronous code look synchronous","passed":true},{"text":"Provides clear, relevant code examples","passed":true},{"text":"Explains proper error handling with try...catch","passed":true}]}],"assertions":[{"text":"Mentions Promises as the underlying mechanism","passed":true,"evidence":"strict-eval: The answer accurately, clearly, and thoroughly covers async/await in JavaScript, referencing Promises, synchronous-looking syntax, and including illustrative code examples and error handling. No errors or omissions found."},{"text":"Explains that async/await makes asynchronous code look synchronous","passed":true},{"text":"Provides clear, relevant code examples","passed":true},{"text":"Explains proper error handling with try...catch","passed":true}]}
+{"timestamp":"2026-02-20T21:40:06.863Z","test_id":"prompt-template-basic","suite":"dataset","score":1,"target":"default","scores":[{"name":"custom-prompt-eval","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explains static typing and early error detection","passed":true,"evidence":"The answer is correct, complete, and clearly organized, covering all major benefits of TypeScript over JavaScript with accurate and specific points. There are no significant omissions."},{"text":"Mentions improved tooling and autocompletion","passed":true},{"text":"Addresses support for modern JS features and compatibility","passed":true},{"text":"Highlights benefits for large-scale application development","passed":true}]}],"assertions":[{"text":"Explains static typing and early error detection","passed":true,"evidence":"custom-prompt-eval: The answer is correct, complete, and clearly organized, covering all major benefits of TypeScript over JavaScript with accurate and specific points. There are no significant omissions."},{"text":"Mentions improved tooling and autocompletion","passed":true},{"text":"Addresses support for modern JS features and compatibility","passed":true},{"text":"Highlights benefits for large-scale application development","passed":true}]}
+{"timestamp":"2026-02-20T21:40:07.754Z","test_id":"prompt-template-with-config","suite":"dataset","score":1,"target":"default","scores":[{"name":"strict-eval","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Mentions Promises as the underlying mechanism","passed":true,"evidence":"The answer accurately, clearly, and thoroughly covers async/await in JavaScript, referencing Promises, synchronous-looking syntax, and including illustrative code examples and error handling. No errors or omissions found."},{"text":"Explains that async/await makes asynchronous code look synchronous","passed":true},{"text":"Provides clear, relevant code examples","passed":true},{"text":"Explains proper error handling with try...catch","passed":true}]}],"assertions":[{"text":"Mentions Promises as the underlying mechanism","passed":true,"evidence":"strict-eval: The answer accurately, clearly, and thoroughly covers async/await in JavaScript, referencing Promises, synchronous-looking syntax, and including illustrative code examples and error handling. No errors or omissions found."},{"text":"Explains that async/await makes asynchronous code look synchronous","passed":true},{"text":"Provides clear, relevant code examples","passed":true},{"text":"Explains proper error handling with try...catch","passed":true}]}
diff --git a/examples/features/rubric/evals/dataset.eval.baseline.jsonl b/examples/features/rubric/evals/dataset.eval.baseline.jsonl
index 3655fafdf..7e293be26 100644
--- a/examples/features/rubric/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/rubric/evals/dataset.eval.baseline.jsonl
@@ -1,5 +1,5 @@
-{"timestamp":"2026-02-20T21:40:12.230Z","test_id":"code-quality-multi-eval","dataset":"dataset","score":0.75,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.5,"weight":1,"verdict":"fail","assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation."},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]},{"name":"python_syntax","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Python syntax is valid","passed":true,"evidence":"Code compiled successfully"}]}],"assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"rubric: The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation. 
| python_syntax: Code compiled successfully"},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"Python syntax is valid","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]}
-{"timestamp":"2026-02-20T21:40:13.903Z","test_id":"code-explanation-simple","dataset":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]}],"assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"rubric: The candidate answer satisfies all three rubric criteria. 
It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]}
-{"timestamp":"2026-02-20T21:40:14.527Z","test_id":"summary-task","dataset":"dataset","score":1,"target":"default","assertions":[{"text":"Mentions faster-than-expected climate change","passed":true,"evidence":"The candidate_answer concisely covers all key points: accelerating climate change, Arctic melt, sea rise, extreme weather, and the scientific call to action, matching the reference answer in both content and tone."},{"text":"Notes rapid Arctic ice melt","passed":true},{"text":"Includes rising sea levels and extreme weather","passed":true},{"text":"Calls out urgent need for emissions cuts and renewables","passed":true}]}
-{"timestamp":"2026-02-20T21:40:18.010Z","test_id":"summary-multi-criteria-score-ranges-proposed","dataset":"dataset","score":0.9666666666666667,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.9666666666666667,"weight":1,"verdict":"pass","assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]}],"assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. 
There are no factual errors or distortions.","passed":true,"evidence":"rubric: The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]}
-{"timestamp":"2026-02-20T21:40:18.450Z","test_id":"technical-writing-detailed","dataset":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]}],"assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"rubric: The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. 
It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]}
+{"timestamp":"2026-02-20T21:40:12.230Z","test_id":"code-quality-multi-eval","suite":"dataset","score":0.75,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.5,"weight":1,"verdict":"fail","assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation."},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]},{"name":"python_syntax","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Python syntax is valid","passed":true,"evidence":"Code compiled successfully"}]}],"assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"rubric: The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation. 
| python_syntax: Code compiled successfully"},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"Python syntax is valid","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]}
+{"timestamp":"2026-02-20T21:40:13.903Z","test_id":"code-explanation-simple","suite":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]}],"assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"rubric: The candidate answer satisfies all three rubric criteria. 
It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]}
+{"timestamp":"2026-02-20T21:40:14.527Z","test_id":"summary-task","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Mentions faster-than-expected climate change","passed":true,"evidence":"The candidate_answer concisely covers all key points: accelerating climate change, Arctic melt, sea rise, extreme weather, and the scientific call to action, matching the reference answer in both content and tone."},{"text":"Notes rapid Arctic ice melt","passed":true},{"text":"Includes rising sea levels and extreme weather","passed":true},{"text":"Calls out urgent need for emissions cuts and renewables","passed":true}]}
+{"timestamp":"2026-02-20T21:40:18.010Z","test_id":"summary-multi-criteria-score-ranges-proposed","suite":"dataset","score":0.9666666666666667,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.9666666666666667,"weight":1,"verdict":"pass","assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]}],"assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. 
There are no factual errors or distortions.","passed":true,"evidence":"rubric: The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]}
+{"timestamp":"2026-02-20T21:40:18.450Z","test_id":"technical-writing-detailed","suite":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]}],"assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"rubric: The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. 
It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]}
diff --git a/examples/features/sdk-config-file/evals/dataset.eval.baseline.jsonl b/examples/features/sdk-config-file/evals/dataset.eval.baseline.jsonl
index a21544e4d..4d9c77fbe 100644
--- a/examples/features/sdk-config-file/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/sdk-config-file/evals/dataset.eval.baseline.jsonl
@@ -1,2 +1,2 @@
-{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"config-json","dataset":"dataset.eval","score":1,"answer":"{\n  \"status\": \"ok\"\n}","target":"default","requests":{"lm":{"chat_prompt":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return status ok"}]}},"input":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return status ok"}],"scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON"}]}
-{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"config-greeting","dataset":"dataset.eval","score":1,"answer":"Hello! How can I help you today?","target":"default","requests":{"lm":{"question":"Hello!","guidelines":""}},"input":"Hello!","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\""}]}
+{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"config-json","suite":"dataset.eval","score":1,"answer":"{\n  \"status\": \"ok\"\n}","target":"default","requests":{"lm":{"chat_prompt":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return status ok"}]}},"input":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return status ok"}],"scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON"}]}
+{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"config-greeting","suite":"dataset.eval","score":1,"answer":"Hello! How can I help you today?","target":"default","requests":{"lm":{"question":"Hello!","guidelines":""}},"input":"Hello!","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\""}]}
diff --git a/examples/features/sdk-custom-assertion/evals/dataset.eval.baseline.jsonl b/examples/features/sdk-custom-assertion/evals/dataset.eval.baseline.jsonl
index 259ee89bf..408b563c5 100644
--- a/examples/features/sdk-custom-assertion/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/sdk-custom-assertion/evals/dataset.eval.baseline.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"json-response","dataset":"dataset.eval","score":1,"answer":"{\n  \"name\": \"John Doe\",\n  \"age\": 30\n}","target":"default","requests":{"lm":{"chat_prompt":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return a JSON object with name and age fields."}]}},"input":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return a JSON object with name and age fields."}],"scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 7 words (>= 3 required)","passed":true,"evidence":"Output has 7 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | word-count: Output has 7 words (>= 3 required)"}]}
-{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"short-answer","dataset":"dataset.eval","score":1,"answer":"2 + 2 = 4","target":"default","requests":{"lm":{"question":"What is 2+2?","guidelines":""}},"input":"What is 2+2?","scores":[{"name":"contains-4","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"4\"","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 5 words (>= 3 required)","passed":true,"evidence":"Output has 5 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"4\"","passed":true,"evidence":"contains-4: Output contains \"4\" | word-count: Output has 5 words (>= 3 required)"}]}
-{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"greeting-response","dataset":"dataset.eval","score":1,"answer":"Hello! I'm an AI assistant here to help with your questions and tasks. How can I assist you today?","target":"default","requests":{"lm":{"question":"Say hello and introduce yourself","guidelines":""}},"input":"Say hello and introduce yourself","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 19 words (>= 3 required)","passed":true,"evidence":"Output has 19 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | word-count: Output has 19 words (>= 3 required)"}]}
+{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"json-response","suite":"dataset.eval","score":1,"answer":"{\n  \"name\": \"John Doe\",\n  \"age\": 30\n}","target":"default","requests":{"lm":{"chat_prompt":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return a JSON object with name and age fields."}]}},"input":[{"role":"system","content":"Respond only with valid JSON."},{"role":"user","content":"Return a JSON object with name and age fields."}],"scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 7 words (>= 3 required)","passed":true,"evidence":"Output has 7 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | word-count: Output has 7 words (>= 3 required)"}]}
+{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"short-answer","suite":"dataset.eval","score":1,"answer":"2 + 2 = 4","target":"default","requests":{"lm":{"question":"What is 2+2?","guidelines":""}},"input":"What is 2+2?","scores":[{"name":"contains-4","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"4\"","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 5 words (>= 3 required)","passed":true,"evidence":"Output has 5 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"4\"","passed":true,"evidence":"contains-4: Output contains \"4\" | word-count: Output has 5 words (>= 3 required)"}]}
+{"timestamp":"2026-02-22T00:00:00.000Z","test_id":"greeting-response","suite":"dataset.eval","score":1,"answer":"Hello! I'm an AI assistant here to help with your questions and tasks. How can I assist you today?","target":"default","requests":{"lm":{"question":"Say hello and introduce yourself","guidelines":""}},"input":"Say hello and introduce yourself","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"word-count","type":"word-count","score":1,"weight":1,"verdict":"pass","input":{"script":["bun","run","/home/christso/projects/agentv_feat-328-sdk-foundation/examples/features/sdk-custom-assertion/.agentv/assertions/word-count.ts"]},"assertions":[{"text":"Output has 19 words (>= 3 required)","passed":true,"evidence":"Output has 19 words (>= 3 required)"}]}],"trace":{"event_count":0,"tool_names":[],"tool_calls_by_name":{},"error_count":0,"llm_call_count":1},"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | word-count: Output has 19 words (>= 3 required)"}]}
diff --git a/examples/features/suite-level-input/evals/dataset.eval.baseline.jsonl b/examples/features/suite-level-input/evals/dataset.eval.baseline.jsonl
index 3087e76db..0cf7b096c 100644
--- a/examples/features/suite-level-input/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/suite-level-input/evals/dataset.eval.baseline.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"japan-spring","dataset":"dataset.eval","score":0,"target":"default-dry-run","input":[{"role":"user","content":"<file path=\"./system-prompt.md\">\nYou are a knowledgeable travel assistant. When users ask about destinations,\nprovide practical advice about climate, best travel seasons, visa requirements,\nand local customs. Always include a safety tip.\n</file>"},{"role":"user","content":"When is the best time to visit Japan?"}],"assertions":[]}
-{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"iceland-northern-lights","dataset":"dataset.eval","score":0,"target":"default-dry-run","input":[{"role":"user","content":"<file path=\"./system-prompt.md\">\nYou are a knowledgeable travel assistant. When users ask about destinations,\nprovide practical advice about climate, best travel seasons, visa requirements,\nand local customs. Always include a safety tip.\n</file>"},{"role":"user","content":"I want to see the Northern Lights in Iceland. When should I go?"}],"assertions":[]}
-{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"skip-suite-input","dataset":"dataset.eval","score":0,"target":"default-dry-run","input":"What currency does Thailand use?","assertions":[]}
+{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"japan-spring","suite":"dataset.eval","score":0,"target":"default-dry-run","input":[{"role":"user","content":"<file path=\"./system-prompt.md\">\nYou are a knowledgeable travel assistant. When users ask about destinations,\nprovide practical advice about climate, best travel seasons, visa requirements,\nand local customs. Always include a safety tip.\n</file>"},{"role":"user","content":"When is the best time to visit Japan?"}],"assertions":[]}
+{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"iceland-northern-lights","suite":"dataset.eval","score":0,"target":"default-dry-run","input":[{"role":"user","content":"<file path=\"./system-prompt.md\">\nYou are a knowledgeable travel assistant. When users ask about destinations,\nprovide practical advice about climate, best travel seasons, visa requirements,\nand local customs. Always include a safety tip.\n</file>"},{"role":"user","content":"I want to see the Northern Lights in Iceland. When should I go?"}],"assertions":[]}
+{"timestamp":"2026-02-24T05:08:14.524Z","test_id":"skip-suite-input","suite":"dataset.eval","score":0,"target":"default-dry-run","input":"What currency does Thailand use?","assertions":[]}
diff --git a/examples/features/threshold-evaluator/evals/dataset.eval.baseline.jsonl b/examples/features/threshold-evaluator/evals/dataset.eval.baseline.jsonl
index 677be1864..5091b38c3 100644
--- a/examples/features/threshold-evaluator/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/threshold-evaluator/evals/dataset.eval.baseline.jsonl
@@ -1 +1 @@
-{"timestamp":"2026-02-20T21:40:22.250Z","test_id":"flexible-gate","dataset":"dataset","score":1,"target":"default","scores":[{"name":"flexible_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"accuracy_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail."},{"text":"Provides sustainable, abundant energy supply","passed":true},{"text":"Increases energy security and independence","passed":true},{"text":"Lowers long-term and stabilizes energy costs","passed":true}]},{"name":"completeness_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions","passed":true,"evidence":"The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer."},{"text":"Provides a sustainable, non-depletable energy supply","passed":true},{"text":"Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"Results in stable energy costs and supports economic growth","passed":true}]},{"name":"conciseness_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"Provides stable and often lower energy costs","passed":true},{"text":"Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"Supports 
economic growth and public health","passed":true}]}],"assertions":[{"text":"[accuracy_check] Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"3/3 evaluators passed (threshold: 0.5); accuracy_check: The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail.; completeness_check: The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer.; conciseness_check: The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"[accuracy_check] Provides sustainable, abundant energy supply","passed":true},{"text":"[accuracy_check] Increases energy security and independence","passed":true},{"text":"[accuracy_check] Lowers long-term and stabilizes energy costs","passed":true},{"text":"[completeness_check] Reduces greenhouse gas emissions","passed":true},{"text":"[completeness_check] Provides a sustainable, non-depletable energy supply","passed":true},{"text":"[completeness_check] Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"[completeness_check] Results in stable energy costs and supports economic growth","passed":true},{"text":"[conciseness_check] Reduces greenhouse gas emissions and air pollution","passed":true},{"text":"[conciseness_check] Provides stable and often lower energy costs","passed":true},{"text":"[conciseness_check] Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"[conciseness_check] Supports economic growth and public health","passed":true}]}],"assertions":[{"text":"[accuracy_check] Reduces greenhouse gas 
emissions and air pollution","passed":true,"evidence":"flexible_gate: 3/3 evaluators passed (threshold: 0.5); accuracy_check: The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail.; completeness_check: The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer.; conciseness_check: The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"[accuracy_check] Provides sustainable, abundant energy supply","passed":true},{"text":"[accuracy_check] Increases energy security and independence","passed":true},{"text":"[accuracy_check] Lowers long-term and stabilizes energy costs","passed":true},{"text":"[completeness_check] Reduces greenhouse gas emissions","passed":true},{"text":"[completeness_check] Provides a sustainable, non-depletable energy supply","passed":true},{"text":"[completeness_check] Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"[completeness_check] Results in stable energy costs and supports economic growth","passed":true},{"text":"[conciseness_check] Reduces greenhouse gas emissions and air pollution","passed":true},{"text":"[conciseness_check] Provides stable and often lower energy costs","passed":true},{"text":"[conciseness_check] Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"[conciseness_check] Supports economic growth and public health","passed":true}]}
+{"timestamp":"2026-02-20T21:40:22.250Z","test_id":"flexible-gate","suite":"dataset","score":1,"target":"default","scores":[{"name":"flexible_gate","type":"composite","score":1,"weight":1,"verdict":"pass","scores":[{"name":"accuracy_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail."},{"text":"Provides sustainable, abundant energy supply","passed":true},{"text":"Increases energy security and independence","passed":true},{"text":"Lowers long-term and stabilizes energy costs","passed":true}]},{"name":"completeness_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions","passed":true,"evidence":"The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer."},{"text":"Provides a sustainable, non-depletable energy supply","passed":true},{"text":"Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"Results in stable energy costs and supports economic growth","passed":true}]},{"name":"conciseness_check","type":"llm-grader","score":1,"verdict":"pass","assertions":[{"text":"Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"Provides stable and often lower energy costs","passed":true},{"text":"Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"Supports 
economic growth and public health","passed":true}]}],"assertions":[{"text":"[accuracy_check] Reduces greenhouse gas emissions and air pollution","passed":true,"evidence":"3/3 evaluators passed (threshold: 0.5); accuracy_check: The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail.; completeness_check: The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer.; conciseness_check: The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"[accuracy_check] Provides sustainable, abundant energy supply","passed":true},{"text":"[accuracy_check] Increases energy security and independence","passed":true},{"text":"[accuracy_check] Lowers long-term and stabilizes energy costs","passed":true},{"text":"[completeness_check] Reduces greenhouse gas emissions","passed":true},{"text":"[completeness_check] Provides a sustainable, non-depletable energy supply","passed":true},{"text":"[completeness_check] Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"[completeness_check] Results in stable energy costs and supports economic growth","passed":true},{"text":"[conciseness_check] Reduces greenhouse gas emissions and air pollution","passed":true},{"text":"[conciseness_check] Provides stable and often lower energy costs","passed":true},{"text":"[conciseness_check] Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"[conciseness_check] Supports economic growth and public health","passed":true}]}],"assertions":[{"text":"[accuracy_check] Reduces greenhouse gas 
emissions and air pollution","passed":true,"evidence":"flexible_gate: 3/3 evaluators passed (threshold: 0.5); accuracy_check: The candidate covers all major benefits found in the reference, adding accurate, concise points on environmental, economic, health, and cost advantages without inaccuracies or unnecessary detail.; completeness_check: The candidate_answer accurately and concisely covers all key benefits of renewable energy, expands on environmental, economic, and health aspects, and maintains clarity while aligning with the reference answer.; conciseness_check: The candidate answer is thorough, accurate, and concise, covering all key benefits listed in the reference answer and adding relevant advantages such as public health and economic impact without unnecessary elaboration."},{"text":"[accuracy_check] Provides sustainable, abundant energy supply","passed":true},{"text":"[accuracy_check] Increases energy security and independence","passed":true},{"text":"[accuracy_check] Lowers long-term and stabilizes energy costs","passed":true},{"text":"[completeness_check] Reduces greenhouse gas emissions","passed":true},{"text":"[completeness_check] Provides a sustainable, non-depletable energy supply","passed":true},{"text":"[completeness_check] Decreases dependence on fossil fuels and enhances energy security","passed":true},{"text":"[completeness_check] Results in stable energy costs and supports economic growth","passed":true},{"text":"[conciseness_check] Reduces greenhouse gas emissions and air pollution","passed":true},{"text":"[conciseness_check] Provides stable and often lower energy costs","passed":true},{"text":"[conciseness_check] Decreases dependence on imported and finite fossil fuels","passed":true},{"text":"[conciseness_check] Supports economic growth and public health","passed":true}]}
diff --git a/examples/features/tool-evaluation-plugins/evals/dataset.eval.baseline.jsonl b/examples/features/tool-evaluation-plugins/evals/dataset.eval.baseline.jsonl
index b6c42ce74..957439836 100644
--- a/examples/features/tool-evaluation-plugins/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/tool-evaluation-plugins/evals/dataset.eval.baseline.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-02-21T04:00:53.322Z","test_id":"weather-lookup-f1","dataset":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"tool-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":2},"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | expected=2 actual=0 TP=0 FP=0 FN=2"},{"text":"Expected tool 'fetch' was NOT called","passed":false}]}],"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"tool-f1: precision=0.000 recall=0.000 F1=0.000 | expected=2 actual=0 TP=0 FP=0 FN=2"},{"text":"Expected tool 'fetch' was NOT called","passed":false}]}
-{"timestamp":"2026-02-21T04:00:53.328Z","test_id":"data-analysis-combined","dataset":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"search: called 0 times (required \u22651)","passed":false}]},{"name":"tool-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":3},"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | expected=3 actual=0 TP=0 FP=0 FN=3"},{"text":"Expected tool 'validate' was NOT called","passed":false},{"text":"Expected tool 'process' was NOT called","passed":false}]}],"assertions":[{"text":"search: called 0 times (required \u22651)","passed":false,"evidence":"tool-f1: precision=0.000 recall=0.000 F1=0.000 | expected=3 actual=0 TP=0 FP=0 FN=3"},{"text":"Expected tool 'search' was NOT called","passed":false},{"text":"Expected tool 'validate' was NOT called","passed":false},{"text":"Expected tool 'process' was NOT called","passed":false}]}
-{"timestamp":"2026-02-21T04:00:53.337Z","test_id":"weather-lookup-args-f1","dataset":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"tool-args-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":2},"assertions":[{"text":"'search' not called with args {\"query\":\"weather tokyo\"}","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | TP=0 FP=0 FN=2"},{"text":"'fetch' not called","passed":false}]}],"assertions":[{"text":"'search' not called with args {\"query\":\"weather tokyo\"}","passed":false,"evidence":"tool-args-f1: precision=0.000 recall=0.000 F1=0.000 | TP=0 FP=0 FN=2"},{"text":"'fetch' not called","passed":false}]}
+{"timestamp":"2026-02-21T04:00:53.322Z","test_id":"weather-lookup-f1","suite":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"tool-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":2},"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | expected=2 actual=0 TP=0 FP=0 FN=2"},{"text":"Expected tool 'fetch' was NOT called","passed":false}]}],"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"tool-f1: precision=0.000 recall=0.000 F1=0.000 | expected=2 actual=0 TP=0 FP=0 FN=2"},{"text":"Expected tool 'fetch' was NOT called","passed":false}]}
+{"timestamp":"2026-02-21T04:00:53.328Z","test_id":"data-analysis-combined","suite":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"search: called 0 times (required \u22651)","passed":false}]},{"name":"tool-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":3},"assertions":[{"text":"Expected tool 'search' was NOT called","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | expected=3 actual=0 TP=0 FP=0 FN=3"},{"text":"Expected tool 'validate' was NOT called","passed":false},{"text":"Expected tool 'process' was NOT called","passed":false}]}],"assertions":[{"text":"search: called 0 times (required \u22651)","passed":false,"evidence":"tool-f1: precision=0.000 recall=0.000 F1=0.000 | expected=3 actual=0 TP=0 FP=0 FN=3"},{"text":"Expected tool 'search' was NOT called","passed":false},{"text":"Expected tool 'validate' was NOT called","passed":false},{"text":"Expected tool 'process' was NOT called","passed":false}]}
+{"timestamp":"2026-02-21T04:00:53.337Z","test_id":"weather-lookup-args-f1","suite":"dataset.eval","score":0,"target":"default-dry-run","scores":[{"name":"tool-args-f1","type":"code-grader","score":0,"weight":1,"verdict":"fail","details":{"precision":0,"recall":0,"f1":0,"tp":0,"fp":0,"fn":2},"assertions":[{"text":"'search' not called with args {\"query\":\"weather tokyo\"}","passed":false,"evidence":"precision=0.000 recall=0.000 F1=0.000 | TP=0 FP=0 FN=2"},{"text":"'fetch' not called","passed":false}]}],"assertions":[{"text":"'search' not called with args {\"query\":\"weather tokyo\"}","passed":false,"evidence":"tool-args-f1: precision=0.000 recall=0.000 F1=0.000 | TP=0 FP=0 FN=2"},{"text":"'fetch' not called","passed":false}]}
diff --git a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.baseline.jsonl b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.baseline.jsonl
index 5dfc10b6c..ac2f77fe0 100644
--- a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.baseline.jsonl
+++ b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.baseline.jsonl
@@ -1,6 +1,6 @@
-{"timestamp":"2026-02-20T21:40:22.878Z","test_id":"exact-sequence-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"exact-workflow","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]}],"assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]}
-{"timestamp":"2026-02-20T21:40:22.890Z","test_id":"any-order-with-minimums","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"research-depth","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}
-{"timestamp":"2026-02-20T21:40:22.895Z","test_id":"in-order-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"search-then-fetch","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}
-{"timestamp":"2026-02-20T21:40:22.931Z","test_id":"tool-input-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"input-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}
-{"timestamp":"2026-02-20T21:40:22.944Z","test_id":"tool-output-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"output-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}
-{"timestamp":"2026-02-20T21:40:22.949Z","test_id":"combined-validation","dataset":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"workflow-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]},{"name":"research-depth","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true},{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}
+{"timestamp":"2026-02-20T21:40:22.878Z","test_id":"exact-sequence-validation","suite":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"exact-workflow","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]}],"assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]}
+{"timestamp":"2026-02-20T21:40:22.890Z","test_id":"any-order-with-minimums","suite":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"research-depth","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}
+{"timestamp":"2026-02-20T21:40:22.895Z","test_id":"in-order-validation","suite":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"search-then-fetch","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}
+{"timestamp":"2026-02-20T21:40:22.931Z","test_id":"tool-input-validation","suite":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"input-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}
+{"timestamp":"2026-02-20T21:40:22.944Z","test_id":"tool-output-validation","suite":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"output-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}],"assertions":[{"text":"Found webSearch at position 0","passed":true},{"text":"Found fetchPage at position 1","passed":true}]}
+{"timestamp":"2026-02-20T21:40:22.949Z","test_id":"combined-validation","suite":"dataset-trace-file-demo","score":1,"target":"static_trace","scores":[{"name":"workflow-validator","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true}]},{"name":"research-depth","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"Position 0: webSearch","passed":true},{"text":"Position 1: fetchPage","passed":true},{"text":"Position 2: webSearch","passed":true},{"text":"Position 3: summarize","passed":true},{"text":"webSearch: called 2 times (required \u22652)","passed":true},{"text":"fetchPage: called 1 times (required \u22651)","passed":true}]}
diff --git a/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl b/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl
index 9f8b1585f..bf2b716ac 100644
--- a/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl
@@ -1,7 +1,7 @@
-{"timestamp":"2026-02-20T21:40:23.514Z","test_id":"any-order-pass","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"tool-usage-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22652)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22652)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true}]}
-{"timestamp":"2026-02-20T21:40:23.520Z","test_id":"exact-auth-flow","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"auth-sequence-exact","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}
-{"timestamp":"2026-02-20T21:40:23.526Z","test_id":"in-order-pass","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}
-{"timestamp":"2026-02-20T21:40:23.569Z","test_id":"metrics-check","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"metrics-tools","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]}
-{"timestamp":"2026-02-20T21:40:23.579Z","test_id":"partial-match","dataset":"dataset","score":0.6666666666666666,"target":"mock_agent","scores":[{"name":"tool-check","type":"tool-trajectory","score":0.6666666666666666,"weight":1,"verdict":"fail","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]}
-{"timestamp":"2026-02-20T21:40:23.599Z","test_id":"exact-args-match","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"arg-validation","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}
-{"timestamp":"2026-02-20T21:40:23.624Z","test_id":"skip-args-validation","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence-only","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]}],"assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]}
+{"timestamp":"2026-02-20T21:40:23.514Z","test_id":"any-order-pass","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"tool-usage-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22652)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22652)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true}]}
+{"timestamp":"2026-02-20T21:40:23.520Z","test_id":"exact-auth-flow","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"auth-sequence-exact","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}
+{"timestamp":"2026-02-20T21:40:23.526Z","test_id":"in-order-pass","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}
+{"timestamp":"2026-02-20T21:40:23.569Z","test_id":"metrics-check","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"metrics-tools","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]}
+{"timestamp":"2026-02-20T21:40:23.579Z","test_id":"partial-match","suite":"dataset","score":0.6666666666666666,"target":"mock_agent","scores":[{"name":"tool-check","type":"tool-trajectory","score":0.6666666666666666,"weight":1,"verdict":"fail","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]}
+{"timestamp":"2026-02-20T21:40:23.599Z","test_id":"exact-args-match","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"arg-validation","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}
+{"timestamp":"2026-02-20T21:40:23.624Z","test_id":"skip-args-validation","suite":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence-only","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]}],"assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]}
diff --git a/examples/features/trace-analysis/evals/multi-agent.eval.results.jsonl b/examples/features/trace-analysis/evals/multi-agent.eval.results.jsonl
index ecfcbce95..cc2f21e75 100644
--- a/examples/features/trace-analysis/evals/multi-agent.eval.results.jsonl
+++ b/examples/features/trace-analysis/evals/multi-agent.eval.results.jsonl
@@ -1,5 +1,5 @@
-{"timestamp": "2026-02-22T10:00:01.000Z", "test_id": "research-question", "dataset": "multi-agent", "score": 0.75, "target": "gpt-4o", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.75}, {"name": "routing_accuracy", "type": "tool_trajectory", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 1.0}], "trace": {"event_count": 8, "tool_names": ["Read", "WebSearch", "tavily_search", "write_report"], "tool_calls_by_name": {"Read": 2, "WebSearch": 3, "tavily_search": 2, "write_report": 1}, "error_count": 0, "token_usage": {"input": 8500, "output": 1667}, "cost_usd": 0.105, "duration_ms": 15080, "llm_call_count": 4, "tool_durations": {"Read": [120, 85], "WebSearch": [2100, 1800, 2200], "tavily_search": [3460, 2100], "write_report": [450]}}, "output": [{"role": "assistant", "content": "I'll research this question by searching multiple sources.", "tool_calls": [{"tool": "WebSearch", "input": {"query": "latest findings on topic"}, "duration_ms": 2100}, {"tool": "WebSearch", "input": {"query": "recent papers 2025 topic"}, "duration_ms": 1800}], "duration_ms": 2360, "token_usage": {"input": 2498, "output": 312}}, {"role": "assistant", "content": "Let me dig deeper with specialized search.", "tool_calls": [{"tool": "tavily_search", "input": {"query": "deep dive topic analysis"}, "duration_ms": 3460}], "duration_ms": 2570, "token_usage": {"input": 1357, "output": 245}}, {"role": "assistant", "content": "Now reading the key documents.", "tool_calls": [{"tool": "Read", "input": {"file": "doc1.pdf"}, "duration_ms": 120}, {"tool": "Read", "input": {"file": "doc2.pdf"}, "duration_ms": 85}, {"tool": "tavily_search", "input": {"query": "supplementary data"}, "duration_ms": 2100}], "duration_ms": 3890, "token_usage": {"input": 3701, "output": 567}}, {"role": "assistant", "content": "Here is my comprehensive research report covering the key findings from multiple sources...", "tool_calls": [{"tool": "write_report", "input": 
{"title": "Research Summary"}, "duration_ms": 450}], "duration_ms": 2800, "token_usage": {"input": 2611, "output": 543}}], "assertions": [{"text": "Provides relevant research findings", "passed": true}, {"text": "Cites multiple sources", "passed": true}, {"text": "Missing critical source from 2025", "passed": false}]}
-{"timestamp": "2026-02-22T10:00:16.000Z", "test_id": "code-review-task", "dataset": "multi-agent", "score": 1.0, "target": "gpt-4o", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 1.0}], "trace": {"event_count": 3, "tool_names": ["Read", "Grep"], "tool_calls_by_name": {"Read": 2, "Grep": 1}, "error_count": 0, "token_usage": {"input": 3200, "output": 800}, "cost_usd": 0.032, "duration_ms": 4500, "llm_call_count": 2, "tool_durations": {"Read": [95, 110], "Grep": [340]}}, "output": [{"role": "assistant", "content": "Let me review the code.", "tool_calls": [{"tool": "Read", "input": {"file": "main.ts"}, "duration_ms": 95}, {"tool": "Grep", "input": {"pattern": "function handleError"}, "duration_ms": 340}], "duration_ms": 1200, "token_usage": {"input": 1600, "output": 200}}, {"role": "assistant", "content": "I found a critical bug in the error handling logic. The function catches the error but doesn't propagate it correctly...", "tool_calls": [{"tool": "Read", "input": {"file": "error-handler.ts"}, "duration_ms": 110}], "duration_ms": 1800, "token_usage": {"input": 1600, "output": 600}}], "assertions": [{"text": "Identifies the bug", "passed": true}, {"text": "Suggests fix", "passed": true}, {"text": "Explains root cause", "passed": true}, {"text": "Follows coding standards", "passed": true}]}
-{"timestamp": "2026-02-22T10:00:21.000Z", "test_id": "data-analysis", "dataset": "multi-agent", "score": 0.5, "target": "claude-sonnet", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.5}, {"name": "step_efficiency", "type": "execution_metrics", "score": 0.8}], "trace": {"event_count": 12, "tool_names": ["Read", "python_exec", "write_file"], "tool_calls_by_name": {"Read": 4, "python_exec": 6, "write_file": 2}, "error_count": 1, "token_usage": {"input": 12000, "output": 3500}, "cost_usd": 0.18, "duration_ms": 28000, "llm_call_count": 5, "tool_durations": {"Read": [80, 90, 110, 75], "python_exec": [1500, 2200, 1800, 3500, 900, 1100], "write_file": [200, 350]}}, "assertions": [{"text": "Processes data correctly", "passed": true}, {"text": "Missing visualization", "passed": false}, {"text": "Incomplete statistical analysis", "passed": false}]}
-{"timestamp": "2026-02-22T10:00:50.000Z", "test_id": "simple-qa", "dataset": "multi-agent", "score": 1.0, "target": "gpt-4o", "trace": {"event_count": 0, "tool_names": [], "tool_calls_by_name": {}, "error_count": 0, "token_usage": {"input": 500, "output": 150}, "cost_usd": 0.005, "duration_ms": 1200, "llm_call_count": 1}, "assertions": [{"text": "Correct answer", "passed": true}, {"text": "Clear explanation", "passed": true}]}
-{"timestamp": "2026-02-22T10:00:52.000Z", "test_id": "multi-step-planning", "dataset": "multi-agent", "score": 0.9, "target": "claude-sonnet", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.9}, {"name": "routing_accuracy", "type": "tool_trajectory", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 0.85}], "trace": {"event_count": 6, "tool_names": ["Read", "Write", "execute_plan"], "tool_calls_by_name": {"Read": 2, "Write": 2, "execute_plan": 2}, "error_count": 0, "token_usage": {"input": 5800, "output": 1200}, "cost_usd": 0.065, "duration_ms": 9500, "llm_call_count": 3, "tool_durations": {"Read": [100, 90], "Write": [250, 300], "execute_plan": [2500, 3200]}}, "assertions": [{"text": "Creates valid plan", "passed": true}, {"text": "Executes steps in order", "passed": true}, {"text": "Handles dependencies", "passed": true}, {"text": "Plan could be more efficient", "passed": false}]}
+{"timestamp": "2026-02-22T10:00:01.000Z", "test_id": "research-question", "suite": "multi-agent", "score": 0.75, "target": "gpt-4o", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.75}, {"name": "routing_accuracy", "type": "tool_trajectory", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 1.0}], "trace": {"event_count": 8, "tool_names": ["Read", "WebSearch", "tavily_search", "write_report"], "tool_calls_by_name": {"Read": 2, "WebSearch": 3, "tavily_search": 2, "write_report": 1}, "error_count": 0, "token_usage": {"input": 8500, "output": 1667}, "cost_usd": 0.105, "duration_ms": 15080, "llm_call_count": 4, "tool_durations": {"Read": [120, 85], "WebSearch": [2100, 1800, 2200], "tavily_search": [3460, 2100], "write_report": [450]}}, "output": [{"role": "assistant", "content": "I'll research this question by searching multiple sources.", "tool_calls": [{"tool": "WebSearch", "input": {"query": "latest findings on topic"}, "duration_ms": 2100}, {"tool": "WebSearch", "input": {"query": "recent papers 2025 topic"}, "duration_ms": 1800}], "duration_ms": 2360, "token_usage": {"input": 2498, "output": 312}}, {"role": "assistant", "content": "Let me dig deeper with specialized search.", "tool_calls": [{"tool": "tavily_search", "input": {"query": "deep dive topic analysis"}, "duration_ms": 3460}], "duration_ms": 2570, "token_usage": {"input": 1357, "output": 245}}, {"role": "assistant", "content": "Now reading the key documents.", "tool_calls": [{"tool": "Read", "input": {"file": "doc1.pdf"}, "duration_ms": 120}, {"tool": "Read", "input": {"file": "doc2.pdf"}, "duration_ms": 85}, {"tool": "tavily_search", "input": {"query": "supplementary data"}, "duration_ms": 2100}], "duration_ms": 3890, "token_usage": {"input": 3701, "output": 567}}, {"role": "assistant", "content": "Here is my comprehensive research report covering the key findings from multiple sources...", "tool_calls": [{"tool": "write_report", "input": {"title": 
"Research Summary"}, "duration_ms": 450}], "duration_ms": 2800, "token_usage": {"input": 2611, "output": 543}}], "assertions": [{"text": "Provides relevant research findings", "passed": true}, {"text": "Cites multiple sources", "passed": true}, {"text": "Missing critical source from 2025", "passed": false}]}
+{"timestamp": "2026-02-22T10:00:16.000Z", "test_id": "code-review-task", "suite": "multi-agent", "score": 1.0, "target": "gpt-4o", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 1.0}], "trace": {"event_count": 3, "tool_names": ["Read", "Grep"], "tool_calls_by_name": {"Read": 2, "Grep": 1}, "error_count": 0, "token_usage": {"input": 3200, "output": 800}, "cost_usd": 0.032, "duration_ms": 4500, "llm_call_count": 2, "tool_durations": {"Read": [95, 110], "Grep": [340]}}, "output": [{"role": "assistant", "content": "Let me review the code.", "tool_calls": [{"tool": "Read", "input": {"file": "main.ts"}, "duration_ms": 95}, {"tool": "Grep", "input": {"pattern": "function handleError"}, "duration_ms": 340}], "duration_ms": 1200, "token_usage": {"input": 1600, "output": 200}}, {"role": "assistant", "content": "I found a critical bug in the error handling logic. The function catches the error but doesn't propagate it correctly...", "tool_calls": [{"tool": "Read", "input": {"file": "error-handler.ts"}, "duration_ms": 110}], "duration_ms": 1800, "token_usage": {"input": 1600, "output": 600}}], "assertions": [{"text": "Identifies the bug", "passed": true}, {"text": "Suggests fix", "passed": true}, {"text": "Explains root cause", "passed": true}, {"text": "Follows coding standards", "passed": true}]}
+{"timestamp": "2026-02-22T10:00:21.000Z", "test_id": "data-analysis", "suite": "multi-agent", "score": 0.5, "target": "claude-sonnet", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.5}, {"name": "step_efficiency", "type": "execution_metrics", "score": 0.8}], "trace": {"event_count": 12, "tool_names": ["Read", "python_exec", "write_file"], "tool_calls_by_name": {"Read": 4, "python_exec": 6, "write_file": 2}, "error_count": 1, "token_usage": {"input": 12000, "output": 3500}, "cost_usd": 0.18, "duration_ms": 28000, "llm_call_count": 5, "tool_durations": {"Read": [80, 90, 110, 75], "python_exec": [1500, 2200, 1800, 3500, 900, 1100], "write_file": [200, 350]}}, "assertions": [{"text": "Processes data correctly", "passed": true}, {"text": "Missing visualization", "passed": false}, {"text": "Incomplete statistical analysis", "passed": false}]}
+{"timestamp": "2026-02-22T10:00:50.000Z", "test_id": "simple-qa", "suite": "multi-agent", "score": 1.0, "target": "gpt-4o", "trace": {"event_count": 0, "tool_names": [], "tool_calls_by_name": {}, "error_count": 0, "token_usage": {"input": 500, "output": 150}, "cost_usd": 0.005, "duration_ms": 1200, "llm_call_count": 1}, "assertions": [{"text": "Correct answer", "passed": true}, {"text": "Clear explanation", "passed": true}]}
+{"timestamp": "2026-02-22T10:00:52.000Z", "test_id": "multi-step-planning", "suite": "multi-agent", "score": 0.9, "target": "claude-sonnet", "scores": [{"name": "response_quality", "type": "llm_grader", "score": 0.9}, {"name": "routing_accuracy", "type": "tool_trajectory", "score": 1.0}, {"name": "step_efficiency", "type": "execution_metrics", "score": 0.85}], "trace": {"event_count": 6, "tool_names": ["Read", "Write", "execute_plan"], "tool_calls_by_name": {"Read": 2, "Write": 2, "execute_plan": 2}, "error_count": 0, "token_usage": {"input": 5800, "output": 1200}, "cost_usd": 0.065, "duration_ms": 9500, "llm_call_count": 3, "tool_durations": {"Read": [100, 90], "Write": [250, 300], "execute_plan": [2500, 3200]}}, "assertions": [{"text": "Creates valid plan", "passed": true}, {"text": "Executes steps in order", "passed": true}, {"text": "Handles dependencies", "passed": true}, {"text": "Plan could be more efficient", "passed": false}]}
diff --git a/examples/features/trace-evaluation/evals/dataset.eval.baseline.jsonl b/examples/features/trace-evaluation/evals/dataset.eval.baseline.jsonl
index 442d2c758..be878f515 100644
--- a/examples/features/trace-evaluation/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/trace-evaluation/evals/dataset.eval.baseline.jsonl
@@ -1,5 +1,5 @@
-{"timestamp":"2026-02-20T21:45:12.843Z","test_id":"error-free-execution","dataset":"dataset","score":1,"target":"default","scores":[{"name":"error-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]}],"assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"error-check: No errors detected in trace"}]}
-{"timestamp":"2026-02-20T21:45:14.027Z","test_id":"no-forbidden-tools","dataset":"dataset","score":1,"target":"default","scores":[{"name":"error-and-tool-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]}],"assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"error-and-tool-check: No errors detected in trace"}]}
-{"timestamp":"2026-02-20T21:45:17.625Z","test_id":"comprehensive-trace-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"span-count","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]},{"name":"error-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]},{"name":"duration-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Total duration (3408ms) within limit (25000ms)","passed":true,"evidence":"Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}],"assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"span-count: Checked span counts: 2 passed, 0 failed | error-check: No errors detected in trace | duration-check: Checked durations against 5000ms threshold: 1 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true},{"text":"Error count (0) within limit (0)","passed":true},{"text":"Total duration (3408ms) within limit (25000ms)","passed":true}]}
-{"timestamp":"2026-02-20T21:45:18.969Z","test_id":"duration-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"duration-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Total duration (6062ms) within limit (25000ms)","passed":true,"evidence":"Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}],"assertions":[{"text":"Total duration (6062ms) within limit (25000ms)","passed":true,"evidence":"duration-check: Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}
-{"timestamp":"2026-02-20T21:47:05.308Z","test_id":"span-count-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"span-count","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]}],"assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"span-count: Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]}
+{"timestamp":"2026-02-20T21:45:12.843Z","test_id":"error-free-execution","suite":"dataset","score":1,"target":"default","scores":[{"name":"error-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]}],"assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"error-check: No errors detected in trace"}]}
+{"timestamp":"2026-02-20T21:45:14.027Z","test_id":"no-forbidden-tools","suite":"dataset","score":1,"target":"default","scores":[{"name":"error-and-tool-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]}],"assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"error-and-tool-check: No errors detected in trace"}]}
+{"timestamp":"2026-02-20T21:45:17.625Z","test_id":"comprehensive-trace-check","suite":"dataset","score":1,"target":"default","scores":[{"name":"span-count","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]},{"name":"error-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Error count (0) within limit (0)","passed":true,"evidence":"No errors detected in trace"}]},{"name":"duration-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Total duration (3408ms) within limit (25000ms)","passed":true,"evidence":"Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}],"assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"span-count: Checked span counts: 2 passed, 0 failed | error-check: No errors detected in trace | duration-check: Checked durations against 5000ms threshold: 1 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true},{"text":"Error count (0) within limit (0)","passed":true},{"text":"Total duration (3408ms) within limit (25000ms)","passed":true}]}
+{"timestamp":"2026-02-20T21:45:18.969Z","test_id":"duration-check","suite":"dataset","score":1,"target":"default","scores":[{"name":"duration-check","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Total duration (6062ms) within limit (25000ms)","passed":true,"evidence":"Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}],"assertions":[{"text":"Total duration (6062ms) within limit (25000ms)","passed":true,"evidence":"duration-check: Checked durations against 5000ms threshold: 1 passed, 0 failed"}]}
+{"timestamp":"2026-02-20T21:47:05.308Z","test_id":"span-count-check","suite":"dataset","score":1,"target":"default","scores":[{"name":"span-count","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]}],"assertions":[{"text":"LLM calls (1) within limit (10)","passed":true,"evidence":"span-count: Checked span counts: 2 passed, 0 failed"},{"text":"Tool calls (0) within limit (15)","passed":true}]}
diff --git a/examples/features/trend/README.md b/examples/features/trend/README.md
index 83c70afe2..a592e3dc4 100644
--- a/examples/features/trend/README.md
+++ b/examples/features/trend/README.md
@@ -1,10 +1,10 @@
 # Trend Analysis Example
 
-This example demonstrates `agentv trend` on three historical runs for the same dataset and target.
+This example demonstrates `agentv trend` on three historical runs for the same suite and target.
 
 Scenario:
 
-- Dataset: `code-review`
+- Suite: `code-review`
 - Target: `claude-sonnet`
 - Test IDs tracked across runs: `summary-accuracy`, `tool-selection`
 - Outcome: scores degrade steadily from `0.92` to `0.86` to `0.80`
@@ -36,7 +36,7 @@ cp -R sample-runs/* .agentv/results/runs/
 Then run:
 
 ```bash
-bun ../../../apps/cli/src/cli.ts trend --last 3 --dataset code-review --target claude-sonnet
+bun ../../../apps/cli/src/cli.ts trend --last 3 --suite code-review --target claude-sonnet
 ```
 
 Expected output:
@@ -45,7 +45,7 @@ Expected output:
 Trend Analysis
 
 Runs: 3 | Range: 2026-03-01T10:00:00.000Z → 2026-03-15T10:00:00.000Z
-Filters: dataset=code-review target=claude-sonnet mode=matched-tests
+Filters: suite=code-review target=claude-sonnet mode=matched-tests
 Matched Tests: 2 | Verdict: degrading
 
   Run                         Tests  Mean Score
@@ -61,7 +61,7 @@ Regression Gate: threshold=0.010 fail_on_degrading=false triggered=false
 Interpretation:
 
 - The command auto-discovers the most recent three runs.
-- It filters to `dataset=code-review` and `target=claude-sonnet`.
+- It filters to `suite=code-review` and `target=claude-sonnet`.
 - It intersects matched test IDs across runs and detects a steady downward score trend.
 
 ## Explicit Inputs
@@ -73,7 +73,7 @@ bun ../../../apps/cli/src/cli.ts trend \
   sample-runs/2026-03-01T10-00-00-000Z \
   sample-runs/2026-03-08T10-00-00-000Z \
   sample-runs/2026-03-15T10-00-00-000Z \
-  --dataset code-review \
+  --suite code-review \
   --target claude-sonnet
 ```
 
@@ -86,7 +86,7 @@ bun ../../../apps/cli/src/cli.ts trend \
   sample-runs/2026-03-01T10-00-00-000Z \
   sample-runs/2026-03-08T10-00-00-000Z \
   sample-runs/2026-03-15T10-00-00-000Z \
-  --dataset code-review \
+  --suite code-review \
   --target claude-sonnet \
   --fail-on-degrading \
   --slope-threshold 0.01
diff --git a/examples/features/trend/sample-runs/2026-03-01T10-00-00-000Z/index.jsonl b/examples/features/trend/sample-runs/2026-03-01T10-00-00-000Z/index.jsonl
index 8379d80e6..7681bfc37 100644
--- a/examples/features/trend/sample-runs/2026-03-01T10-00-00-000Z/index.jsonl
+++ b/examples/features/trend/sample-runs/2026-03-01T10-00-00-000Z/index.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"claude-sonnet","score":0.94}
-{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"tool-selection","dataset":"code-review","target":"claude-sonnet","score":0.90}
-{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"gpt-5","score":0.88}
+{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"summary-accuracy","suite":"code-review","target":"claude-sonnet","score":0.94}
+{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"tool-selection","suite":"code-review","target":"claude-sonnet","score":0.90}
+{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"summary-accuracy","suite":"code-review","target":"gpt-5","score":0.88}
diff --git a/examples/features/trend/sample-runs/2026-03-08T10-00-00-000Z/index.jsonl b/examples/features/trend/sample-runs/2026-03-08T10-00-00-000Z/index.jsonl
index 3a41da3b0..510978631 100644
--- a/examples/features/trend/sample-runs/2026-03-08T10-00-00-000Z/index.jsonl
+++ b/examples/features/trend/sample-runs/2026-03-08T10-00-00-000Z/index.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"claude-sonnet","score":0.88}
-{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"tool-selection","dataset":"code-review","target":"claude-sonnet","score":0.84}
-{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"gpt-5","score":0.90}
+{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"summary-accuracy","suite":"code-review","target":"claude-sonnet","score":0.88}
+{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"tool-selection","suite":"code-review","target":"claude-sonnet","score":0.84}
+{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"summary-accuracy","suite":"code-review","target":"gpt-5","score":0.90}
diff --git a/examples/features/trend/sample-runs/2026-03-15T10-00-00-000Z/index.jsonl b/examples/features/trend/sample-runs/2026-03-15T10-00-00-000Z/index.jsonl
index 75dc05a21..84cc2e2ea 100644
--- a/examples/features/trend/sample-runs/2026-03-15T10-00-00-000Z/index.jsonl
+++ b/examples/features/trend/sample-runs/2026-03-15T10-00-00-000Z/index.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"claude-sonnet","score":0.82}
-{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"tool-selection","dataset":"code-review","target":"claude-sonnet","score":0.78}
-{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"gpt-5","score":0.91}
+{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"summary-accuracy","suite":"code-review","target":"claude-sonnet","score":0.82}
+{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"tool-selection","suite":"code-review","target":"claude-sonnet","score":0.78}
+{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"summary-accuracy","suite":"code-review","target":"gpt-5","score":0.91}
diff --git a/examples/features/trials/evals/dataset.eval.baseline.jsonl b/examples/features/trials/evals/dataset.eval.baseline.jsonl
index 7bda67475..46c11a21f 100644
--- a/examples/features/trials/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/trials/evals/dataset.eval.baseline.jsonl
@@ -1,2 +1,2 @@
-{"timestamp":"2026-02-20T21:40:25.928Z","test_id":"capital-knowledge","dataset":"dataset","score":1,"target":"default","trials":[{"attempt":0,"score":1,"verdict":"pass"}],"aggregation":{"strategy":"pass_at_k","passed_attempts":1,"total_attempts":1},"assertions":[{"text":"Correctly identifies Canberra as the capital of Australia","passed":true,"evidence":"The candidate answer provides the correct and complete information, fully matching the reference answer."}]}
-{"timestamp":"2026-02-20T21:40:26.593Z","test_id":"math-basics","dataset":"dataset","score":1,"target":"default","trials":[{"attempt":0,"score":1,"verdict":"pass"}],"aggregation":{"strategy":"pass_at_k","passed_attempts":1,"total_attempts":1},"assertions":[{"text":"Explains step-by-step reasoning","passed":true,"evidence":"The candidate answer breaks down the calculation clearly, explains each step, and arrives at the correct answer, matching the reference reasoning."},{"text":"Splits 15 into 10 and 5 for easier calculation","passed":true},{"text":"Calculates partial products (10\u00d77 and 5\u00d77)","passed":true},{"text":"Arrives at correct final answer (105)","passed":true}]}
+{"timestamp":"2026-02-20T21:40:25.928Z","test_id":"capital-knowledge","suite":"dataset","score":1,"target":"default","trials":[{"attempt":0,"score":1,"verdict":"pass"}],"aggregation":{"strategy":"pass_at_k","passed_attempts":1,"total_attempts":1},"assertions":[{"text":"Correctly identifies Canberra as the capital of Australia","passed":true,"evidence":"The candidate answer provides the correct and complete information, fully matching the reference answer."}]}
+{"timestamp":"2026-02-20T21:40:26.593Z","test_id":"math-basics","suite":"dataset","score":1,"target":"default","trials":[{"attempt":0,"score":1,"verdict":"pass"}],"aggregation":{"strategy":"pass_at_k","passed_attempts":1,"total_attempts":1},"assertions":[{"text":"Explains step-by-step reasoning","passed":true,"evidence":"The candidate answer breaks down the calculation clearly, explains each step, and arrives at the correct answer, matching the reference reasoning."},{"text":"Splits 15 into 10 and 5 for easier calculation","passed":true},{"text":"Calculates partial products (10\u00d77 and 5\u00d77)","passed":true},{"text":"Arrives at correct final answer (105)","passed":true}]}
diff --git a/examples/features/weighted-evaluators/evals/dataset.eval.baseline.jsonl b/examples/features/weighted-evaluators/evals/dataset.eval.baseline.jsonl
index 258bc807e..9023b8a6c 100644
--- a/examples/features/weighted-evaluators/evals/dataset.eval.baseline.jsonl
+++ b/examples/features/weighted-evaluators/evals/dataset.eval.baseline.jsonl
@@ -1,3 +1,3 @@
-{"timestamp":"2026-02-20T21:40:31.897Z","test_id":"experimental-evaluator-disabled","dataset":"weighted-evaluators-examples","score":1,"target":"default","scores":[{"name":"accuracy","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Accurately describes RL as learning via interaction and feedback","passed":true,"evidence":"All factual statements are correct and the explanation accurately details reinforcement learning's fundamental principles and applications."},{"text":"Lists key concepts core to RL such as agent, environment, state, and reward","passed":true},{"text":"Correctly differentiates RL from supervised learning","passed":true},{"text":"Mentions real-world applications like robotics and game playing","passed":true}]},{"name":"experimental-metric","type":"llm-grader","score":1,"weight":0,"verdict":"pass","assertions":[{"text":"Provides expanded explanation and context","passed":true,"evidence":"Experimental metric: response excels in coverage, clarity, and the inclusion of relevant context and distinctions, notably exceeding the basic requirements."},{"text":"Lists and defines key RL concepts","passed":true},{"text":"Contrasts RL with supervised learning","passed":true},{"text":"Mentions practical applications","passed":true}]}],"assertions":[{"text":"Accurately describes RL as learning via interaction and feedback","passed":true,"evidence":"accuracy: All factual statements are correct and the explanation accurately details reinforcement learning's fundamental principles and applications. 
| experimental-metric: Experimental metric: response excels in coverage, clarity, and the inclusion of relevant context and distinctions, notably exceeding the basic requirements."},{"text":"Lists key concepts core to RL such as agent, environment, state, and reward","passed":true},{"text":"Correctly differentiates RL from supervised learning","passed":true},{"text":"Mentions real-world applications like robotics and game playing","passed":true},{"text":"Provides expanded explanation and context","passed":true},{"text":"Lists and defines key RL concepts","passed":true},{"text":"Contrasts RL with supervised learning","passed":true},{"text":"Mentions practical applications","passed":true}]}
-{"timestamp":"2026-02-20T21:40:32.552Z","test_id":"equal-weights-default","dataset":"weighted-evaluators-examples","score":0.9833333333333334,"target":"default","scores":[{"name":"correctness","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise.","passed":true,"evidence":"The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise."}]},{"name":"completeness","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Defines deep learning as a subset of machine learning","passed":true,"evidence":"The response thoroughly covers all key aspects, including definition, mechanisms, applications, and relevant details, making it complete and comprehensive."},{"text":"Describes neural networks with many layers for learning complex representations","passed":true},{"text":"Notes inspiration from the human brain and feature discovery","passed":true},{"text":"Mentions key applications and methods like backpropagation","passed":true}]},{"name":"clarity","type":"llm-grader","score":0.95,"weight":1,"verdict":"pass","assertions":[{"text":"The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized.","passed":true,"evidence":"The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized."}]}],"assertions":[{"text":"Defines deep learning as a subset of machine learning","passed":true,"evidence":"correctness: The candidate answer is factually accurate, logically 
consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise. | completeness: The response thoroughly covers all key aspects, including definition, mechanisms, applications, and relevant details, making it complete and comprehensive. | clarity: The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized."},{"text":"Describes neural networks with many layers for learning complex representations","passed":true},{"text":"Notes inspiration from the human brain and feature discovery","passed":true},{"text":"Mentions key applications and methods like backpropagation","passed":true}]}
-{"timestamp":"2026-02-20T21:40:33.364Z","test_id":"weighted-multi-evaluator","dataset":"weighted-evaluators-examples","score":1,"target":"default","scores":[{"name":"safety-check","type":"llm-grader","score":1,"weight":3,"verdict":"pass","assertions":[{"text":"The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language.","passed":true,"evidence":"The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language."}]},{"name":"quality-check","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Accurately describes neural networks as computational models inspired by the brain","passed":true,"evidence":"The response is thorough, accurate, and well-organized, covering both conceptual understanding and practical aspects. It goes beyond the reference by including more structural detail and common use cases."},{"text":"Explains the structure of neural networks with input, hidden, and output layers","passed":true},{"text":"Details the learning process including training, weight adjustment, and backpropagation","passed":true},{"text":"Mentions common applications such as image recognition and natural language processing","passed":true}]},{"name":"style-check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Clear, organized structure with headings and bullet points","passed":true,"evidence":"The candidate answer excels in clarity, organization, and readability, using structured sections, precise definitions, and relevant examples, with a tone that is informative and appropriately formal."},{"text":"Appropriate tone and formality for an explanatory answer","passed":true},{"text":"Smooth sentence flow and varied sentence structure","passed":true},{"text":"Use of concrete examples (applications) to aid 
understanding","passed":true}]}],"assertions":[{"text":"Accurately describes neural networks as computational models inspired by the brain","passed":true,"evidence":"safety-check: The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language. | quality-check: The response is thorough, accurate, and well-organized, covering both conceptual understanding and practical aspects. It goes beyond the reference by including more structural detail and common use cases. | style-check: The candidate answer excels in clarity, organization, and readability, using structured sections, precise definitions, and relevant examples, with a tone that is informative and appropriately formal."},{"text":"Explains the structure of neural networks with input, hidden, and output layers","passed":true},{"text":"Details the learning process including training, weight adjustment, and backpropagation","passed":true},{"text":"Mentions common applications such as image recognition and natural language processing","passed":true},{"text":"Clear, organized structure with headings and bullet points","passed":true},{"text":"Appropriate tone and formality for an explanatory answer","passed":true},{"text":"Smooth sentence flow and varied sentence structure","passed":true},{"text":"Use of concrete examples (applications) to aid understanding","passed":true}]}
+{"timestamp":"2026-02-20T21:40:31.897Z","test_id":"experimental-evaluator-disabled","suite":"weighted-evaluators-examples","score":1,"target":"default","scores":[{"name":"accuracy","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Accurately describes RL as learning via interaction and feedback","passed":true,"evidence":"All factual statements are correct and the explanation accurately details reinforcement learning's fundamental principles and applications."},{"text":"Lists key concepts core to RL such as agent, environment, state, and reward","passed":true},{"text":"Correctly differentiates RL from supervised learning","passed":true},{"text":"Mentions real-world applications like robotics and game playing","passed":true}]},{"name":"experimental-metric","type":"llm-grader","score":1,"weight":0,"verdict":"pass","assertions":[{"text":"Provides expanded explanation and context","passed":true,"evidence":"Experimental metric: response excels in coverage, clarity, and the inclusion of relevant context and distinctions, notably exceeding the basic requirements."},{"text":"Lists and defines key RL concepts","passed":true},{"text":"Contrasts RL with supervised learning","passed":true},{"text":"Mentions practical applications","passed":true}]}],"assertions":[{"text":"Accurately describes RL as learning via interaction and feedback","passed":true,"evidence":"accuracy: All factual statements are correct and the explanation accurately details reinforcement learning's fundamental principles and applications. 
| experimental-metric: Experimental metric: response excels in coverage, clarity, and the inclusion of relevant context and distinctions, notably exceeding the basic requirements."},{"text":"Lists key concepts core to RL such as agent, environment, state, and reward","passed":true},{"text":"Correctly differentiates RL from supervised learning","passed":true},{"text":"Mentions real-world applications like robotics and game playing","passed":true},{"text":"Provides expanded explanation and context","passed":true},{"text":"Lists and defines key RL concepts","passed":true},{"text":"Contrasts RL with supervised learning","passed":true},{"text":"Mentions practical applications","passed":true}]}
+{"timestamp":"2026-02-20T21:40:32.552Z","test_id":"equal-weights-default","suite":"weighted-evaluators-examples","score":0.9833333333333334,"target":"default","scores":[{"name":"correctness","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise.","passed":true,"evidence":"The candidate answer is factually accurate, logically consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise."}]},{"name":"completeness","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Defines deep learning as a subset of machine learning","passed":true,"evidence":"The response thoroughly covers all key aspects, including definition, mechanisms, applications, and relevant details, making it complete and comprehensive."},{"text":"Describes neural networks with many layers for learning complex representations","passed":true},{"text":"Notes inspiration from the human brain and feature discovery","passed":true},{"text":"Mentions key applications and methods like backpropagation","passed":true}]},{"name":"clarity","type":"llm-grader","score":0.95,"weight":1,"verdict":"pass","assertions":[{"text":"The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized.","passed":true,"evidence":"The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized."}]}],"assertions":[{"text":"Defines deep learning as a subset of machine learning","passed":true,"evidence":"correctness: The candidate answer is factually accurate, logically 
consistent, and free from errors or contradictions. It correctly describes deep learning, adds relevant detail, and is technically precise. | completeness: The response thoroughly covers all key aspects, including definition, mechanisms, applications, and relevant details, making it complete and comprehensive. | clarity: The response is clear, well-organized, and uses accessible language for a general audience, with only minor use of technical terms like 'backpropagation' that are briefly contextualized."},{"text":"Describes neural networks with many layers for learning complex representations","passed":true},{"text":"Notes inspiration from the human brain and feature discovery","passed":true},{"text":"Mentions key applications and methods like backpropagation","passed":true}]}
+{"timestamp":"2026-02-20T21:40:33.364Z","test_id":"weighted-multi-evaluator","suite":"weighted-evaluators-examples","score":1,"target":"default","scores":[{"name":"safety-check","type":"llm-grader","score":1,"weight":3,"verdict":"pass","assertions":[{"text":"The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language.","passed":true,"evidence":"The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language."}]},{"name":"quality-check","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Accurately describes neural networks as computational models inspired by the brain","passed":true,"evidence":"The response is thorough, accurate, and well-organized, covering both conceptual understanding and practical aspects. It goes beyond the reference by including more structural detail and common use cases."},{"text":"Explains the structure of neural networks with input, hidden, and output layers","passed":true},{"text":"Details the learning process including training, weight adjustment, and backpropagation","passed":true},{"text":"Mentions common applications such as image recognition and natural language processing","passed":true}]},{"name":"style-check","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Clear, organized structure with headings and bullet points","passed":true,"evidence":"The candidate answer excels in clarity, organization, and readability, using structured sections, precise definitions, and relevant examples, with a tone that is informative and appropriately formal."},{"text":"Appropriate tone and formality for an explanatory answer","passed":true},{"text":"Smooth sentence flow and varied sentence structure","passed":true},{"text":"Use of concrete examples (applications) to aid 
understanding","passed":true}]}],"assertions":[{"text":"Accurately describes neural networks as computational models inspired by the brain","passed":true,"evidence":"safety-check: The response is accurate, neutral, free of harmful or inappropriate content, and does not contain any privacy violations or discriminatory language. | quality-check: The response is thorough, accurate, and well-organized, covering both conceptual understanding and practical aspects. It goes beyond the reference by including more structural detail and common use cases. | style-check: The candidate answer excels in clarity, organization, and readability, using structured sections, precise definitions, and relevant examples, with a tone that is informative and appropriately formal."},{"text":"Explains the structure of neural networks with input, hidden, and output layers","passed":true},{"text":"Details the learning process including training, weight adjustment, and backpropagation","passed":true},{"text":"Mentions common applications such as image recognition and natural language processing","passed":true},{"text":"Clear, organized structure with headings and bullet points","passed":true},{"text":"Appropriate tone and formality for an explanatory answer","passed":true},{"text":"Smooth sentence flow and varied sentence structure","passed":true},{"text":"Use of concrete examples (applications) to aid understanding","passed":true}]}
diff --git a/examples/showcase/cw-incident-triage/evals/dataset.eval.baseline.jsonl b/examples/showcase/cw-incident-triage/evals/dataset.eval.baseline.jsonl
index b78b4531e..277555d84 100644
--- a/examples/showcase/cw-incident-triage/evals/dataset.eval.baseline.jsonl
+++ b/examples/showcase/cw-incident-triage/evals/dataset.eval.baseline.jsonl
@@ -1,8 +1,8 @@
-{"timestamp":"2026-02-20T21:43:10.802Z","test_id":"cr-module-inaccessible","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly classifies as CR2 for module-wide inaccessibility","passed":true,"evidence":"The candidate answer accurately identifies CR2, thoroughly explains the distinction from CR1, and justifies the decision by referencing the ticket details and criticality definitions."},{"text":"Provides clear reasoning that distinguishes from CR1","passed":true},{"text":"References specific signals: 'Module not found' error and other modules working","passed":true},{"text":"Analyzes scope, impact, and classification logic stepwise","passed":true}]}],"assertions":[{"text":"Correctly classifies as CR2 for module-wide inaccessibility","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately identifies CR2, thoroughly explains the distinction from CR1, and justifies the decision by referencing the ticket details and criticality definitions."},{"text":"Provides clear reasoning that distinguishes from CR1","passed":true},{"text":"References specific signals: 'Module not found' error and other modules working","passed":true},{"text":"Analyzes scope, impact, and classification logic stepwise","passed":true}]}
-{"timestamp":"2026-02-20T21:43:10.909Z","test_id":"cr-global-outage","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly assigns CR1 for complete system inaccessibility","passed":true,"evidence":"The response accurately identifies the severity as CR1, references the 'any user on any workstation' criterion, and thoroughly matches the classification logic to ticket details with clear reasoning."},{"text":"Cites that no users can log in from any device","passed":true},{"text":"Notes global impact and halting of all business activities","passed":true},{"text":"Explicitly connects ticket details to CR1 definition","passed":true}]}],"assertions":[{"text":"Correctly assigns CR1 for complete system inaccessibility","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The response accurately identifies the severity as CR1, references the 'any user on any workstation' criterion, and thoroughly matches the classification logic to ticket details with clear reasoning."},{"text":"Cites that no users can log in from any device","passed":true},{"text":"Notes global impact and halting of all business activities","passed":true},{"text":"Explicitly connects ticket details to CR1 definition","passed":true}]}
-{"timestamp":"2026-02-20T21:43:11.358Z","test_id":"cr-missing-validation-disguised-as-defect","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly assigns CR6 despite critical bug label","passed":true,"evidence":"The candidate answer accurately matches the reference, providing clear step-by-step reasoning that aligns perfectly with the expected classification and justification for CR6."},{"text":"References the prior fix addressed length only","passed":true},{"text":"Notes the lack of uniqueness in documentation/spec","passed":true},{"text":"Explains the request is an enhancement, not a defect","passed":true}]}],"assertions":[{"text":"Correctly assigns CR6 despite critical bug label","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately matches the reference, providing clear step-by-step reasoning that aligns perfectly with the expected classification and justification for CR6."},{"text":"References the prior fix addressed length only","passed":true},{"text":"Notes the lack of uniqueness in documentation/spec","passed":true},{"text":"Explains the request is an enhancement, not a defect","passed":true}]}
-{"timestamp":"2026-02-20T21:43:13.529Z","test_id":"cr-function-bug-no-workaround","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies issue as affecting a single function (invoice generation)","passed":true,"evidence":"Candidate precisely follows the criteria: confirms no workaround, ties to deviation from docs and prior behavior, and gives a clear, stepwise explanation for the CR3 classification."},{"text":"Notes deviation from documented specs and prior correct behavior","passed":true},{"text":"Confirms no viable manual calculation workaround exists","passed":true},{"text":"Concludes with correct CR3 classification","passed":true}]}],"assertions":[{"text":"Identifies issue as affecting a single function (invoice generation)","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: Candidate precisely follows the criteria: confirms no workaround, ties to deviation from docs and prior behavior, and gives a clear, stepwise explanation for the CR3 classification."},{"text":"Notes deviation from documented specs and prior correct behavior","passed":true},{"text":"Confirms no viable manual calculation workaround exists","passed":true},{"text":"Concludes with correct CR3 classification","passed":true}]}
-{"timestamp":"2026-02-20T21:43:13.727Z","test_id":"cr-compliance-data-update","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly classifies as CR8","passed":true,"evidence":"The candidate answer fully matches the reference, logically prioritizing data accuracy and compliance per the guidelines, and provides a clear step-by-step justification for CR8."},{"text":"Explicitly identifies the issue as master data/compliance update","passed":true},{"text":"Prioritizes data accuracy over potential bug claims","passed":true},{"text":"Clearly references classification guidelines for reasoning","passed":true}]}],"assertions":[{"text":"Correctly classifies as CR8","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer fully matches the reference, logically prioritizing data accuracy and compliance per the guidelines, and provides a clear step-by-step justification for CR8."},{"text":"Explicitly identifies the issue as master data/compliance update","passed":true},{"text":"Prioritizes data accuracy over potential bug claims","passed":true},{"text":"Clearly references classification guidelines for reasoning","passed":true}]}
-{"timestamp":"2026-02-20T21:43:13.818Z","test_id":"cr-feature-quote","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies the request as a new feature (custom API)","passed":true,"evidence":"The candidate answer fully captures both the scope (new feature) and the pricing/acceleration signal, provides clear stepwise reasoning, and clearly distinguishes CR7 from CR6 as required."},{"text":"Notes explicit pricing/quote request for accelerated development","passed":true},{"text":"Correctly selects CR7 per criticality definitions","passed":true},{"text":"Step-by-step reasoning is clear and differentiates from CR6","passed":true}]}],"assertions":[{"text":"Identifies the request as a new feature (custom API)","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer fully captures both the scope (new feature) and the pricing/acceleration signal, provides clear stepwise reasoning, and clearly distinguishes CR7 from CR6 as required."},{"text":"Notes explicit pricing/quote request for accelerated development","passed":true},{"text":"Correctly selects CR7 per criticality definitions","passed":true},{"text":"Step-by-step reasoning is clear and differentiates from CR6","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.043Z","test_id":"cr-multi-part-blend","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identifies function bug as main issue","passed":true,"evidence":"The candidate's reasoning steps mirror the reference, properly handling multi-element classification and giving precedence to the function bug (CR3) over the training request (CR5); no omissions or errors found."},{"text":"Notes deviation from documented behavior","passed":true},{"text":"Recognizes absence of viable workaround","passed":true},{"text":"Properly prioritizes CR3 over secondary CR5 training request","passed":true}]}],"assertions":[{"text":"Correctly identifies function bug as main issue","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate's reasoning steps mirror the reference, properly handling multi-element classification and giving precedence to the function bug (CR3) over the training request (CR5); no omissions or errors found."},{"text":"Notes deviation from documented behavior","passed":true},{"text":"Recognizes absence of viable workaround","passed":true},{"text":"Properly prioritizes CR3 over secondary CR5 training request","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.182Z","test_id":"cr-workaround-exists","dataset":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies issue as a single function failure","passed":true,"evidence":"The candidate answer accurately classifies the ticket as CR4, clearly distinguishes it from higher criticality by noting the workaround, and provides stepwise reasoning that aligns with the reference answer and guidelines."},{"text":"Notes existence of a viable workaround via API","passed":true},{"text":"Explains distinction from system/module level issues","passed":true},{"text":"Correctly selects CR4 with clear reasoning","passed":true}]}],"assertions":[{"text":"Identifies issue as a single function failure","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately classifies the ticket as CR4, clearly distinguishes it from higher criticality by noting the workaround, and provides stepwise reasoning that aligns with the reference answer and guidelines."},{"text":"Notes existence of a viable workaround via API","passed":true},{"text":"Explains distinction from system/module level issues","passed":true},{"text":"Correctly selects CR4 with clear reasoning","passed":true}]}
+{"timestamp":"2026-02-20T21:43:10.802Z","test_id":"cr-module-inaccessible","suite":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly classifies as CR2 for module-wide inaccessibility","passed":true,"evidence":"The candidate answer accurately identifies CR2, thoroughly explains the distinction from CR1, and justifies the decision by referencing the ticket details and criticality definitions."},{"text":"Provides clear reasoning that distinguishes from CR1","passed":true},{"text":"References specific signals: 'Module not found' error and other modules working","passed":true},{"text":"Analyzes scope, impact, and classification logic stepwise","passed":true}]}],"assertions":[{"text":"Correctly classifies as CR2 for module-wide inaccessibility","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately identifies CR2, thoroughly explains the distinction from CR1, and justifies the decision by referencing the ticket details and criticality definitions."},{"text":"Provides clear reasoning that distinguishes from CR1","passed":true},{"text":"References specific signals: 'Module not found' error and other modules working","passed":true},{"text":"Analyzes scope, impact, and classification logic stepwise","passed":true}]}
+{"timestamp":"2026-02-20T21:43:10.909Z","test_id":"cr-global-outage","suite":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly assigns CR1 for complete system inaccessibility","passed":true,"evidence":"The response accurately identifies the severity as CR1, references the 'any user on any workstation' criterion, and thoroughly matches the classification logic to ticket details with clear reasoning."},{"text":"Cites that no users can log in from any device","passed":true},{"text":"Notes global impact and halting of all business activities","passed":true},{"text":"Explicitly connects ticket details to CR1 definition","passed":true}]}],"assertions":[{"text":"Correctly assigns CR1 for complete system inaccessibility","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The response accurately identifies the severity as CR1, references the 'any user on any workstation' criterion, and thoroughly matches the classification logic to ticket details with clear reasoning."},{"text":"Cites that no users can log in from any device","passed":true},{"text":"Notes global impact and halting of all business activities","passed":true},{"text":"Explicitly connects ticket details to CR1 definition","passed":true}]}
+{"timestamp":"2026-02-20T21:43:11.358Z","test_id":"cr-missing-validation-disguised-as-defect","suite":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly assigns CR6 despite critical bug label","passed":true,"evidence":"The candidate answer accurately matches the reference, providing clear step-by-step reasoning that aligns perfectly with the expected classification and justification for CR6."},{"text":"References the prior fix addressed length only","passed":true},{"text":"Notes the lack of uniqueness in documentation/spec","passed":true},{"text":"Explains the request is an enhancement, not a defect","passed":true}]}],"assertions":[{"text":"Correctly assigns CR6 despite critical bug label","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately matches the reference, providing clear step-by-step reasoning that aligns perfectly with the expected classification and justification for CR6."},{"text":"References the prior fix addressed length only","passed":true},{"text":"Notes the lack of uniqueness in documentation/spec","passed":true},{"text":"Explains the request is an enhancement, not a defect","passed":true}]}
+{"timestamp":"2026-02-20T21:43:13.529Z","test_id":"cr-function-bug-no-workaround","suite":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies issue as affecting a single function (invoice generation)","passed":true,"evidence":"Candidate precisely follows the criteria: confirms no workaround, ties to deviation from docs and prior behavior, and gives a clear, stepwise explanation for the CR3 classification."},{"text":"Notes deviation from documented specs and prior correct behavior","passed":true},{"text":"Confirms no viable manual calculation workaround exists","passed":true},{"text":"Concludes with correct CR3 classification","passed":true}]}],"assertions":[{"text":"Identifies issue as affecting a single function (invoice generation)","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: Candidate precisely follows the criteria: confirms no workaround, ties to deviation from docs and prior behavior, and gives a clear, stepwise explanation for the CR3 classification."},{"text":"Notes deviation from documented specs and prior correct behavior","passed":true},{"text":"Confirms no viable manual calculation workaround exists","passed":true},{"text":"Concludes with correct CR3 classification","passed":true}]}
+{"timestamp":"2026-02-20T21:43:13.727Z","test_id":"cr-compliance-data-update","suite":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly classifies as CR8","passed":true,"evidence":"The candidate answer fully matches the reference, logically prioritizing data accuracy and compliance per the guidelines, and provides a clear step-by-step justification for CR8."},{"text":"Explicitly identifies the issue as master data/compliance update","passed":true},{"text":"Prioritizes data accuracy over potential bug claims","passed":true},{"text":"Clearly references classification guidelines for reasoning","passed":true}]}],"assertions":[{"text":"Correctly classifies as CR8","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer fully matches the reference, logically prioritizing data accuracy and compliance per the guidelines, and provides a clear step-by-step justification for CR8."},{"text":"Explicitly identifies the issue as master data/compliance update","passed":true},{"text":"Prioritizes data accuracy over potential bug claims","passed":true},{"text":"Clearly references classification guidelines for reasoning","passed":true}]}
+{"timestamp":"2026-02-20T21:43:13.818Z","test_id":"cr-feature-quote","suite":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies the request as a new feature (custom API)","passed":true,"evidence":"The candidate answer fully captures both the scope (new feature) and the pricing/acceleration signal, provides clear stepwise reasoning, and clearly distinguishes CR7 from CR6 as required."},{"text":"Notes explicit pricing/quote request for accelerated development","passed":true},{"text":"Correctly selects CR7 per criticality definitions","passed":true},{"text":"Step-by-step reasoning is clear and differentiates from CR6","passed":true}]}],"assertions":[{"text":"Identifies the request as a new feature (custom API)","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer fully captures both the scope (new feature) and the pricing/acceleration signal, provides clear stepwise reasoning, and clearly distinguishes CR7 from CR6 as required."},{"text":"Notes explicit pricing/quote request for accelerated development","passed":true},{"text":"Correctly selects CR7 per criticality definitions","passed":true},{"text":"Step-by-step reasoning is clear and differentiates from CR6","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.043Z","test_id":"cr-multi-part-blend","suite":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identifies function bug as main issue","passed":true,"evidence":"The candidate's reasoning steps mirror the reference, properly handling multi-element classification and giving precedence to the function bug (CR3) over the training request (CR5); no omissions or errors found."},{"text":"Notes deviation from documented behavior","passed":true},{"text":"Recognizes absence of viable workaround","passed":true},{"text":"Properly prioritizes CR3 over secondary CR5 training request","passed":true}]}],"assertions":[{"text":"Correctly identifies function bug as main issue","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate's reasoning steps mirror the reference, properly handling multi-element classification and giving precedence to the function bug (CR3) over the training request (CR5); no omissions or errors found."},{"text":"Notes deviation from documented behavior","passed":true},{"text":"Recognizes absence of viable workaround","passed":true},{"text":"Properly prioritizes CR3 over secondary CR5 training request","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.182Z","test_id":"cr-workaround-exists","suite":"dataset","conversation_id":"cargowise-triage","score":1,"target":"default","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: criticalityRating, reasoning","passed":true,"evidence":"Valid JSON with all required keys: criticalityRating, reasoning"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies issue as a single function failure","passed":true,"evidence":"The candidate answer accurately classifies the ticket as CR4, clearly distinguishes it from higher criticality by noting the workaround, and provides stepwise reasoning that aligns with the reference answer and guidelines."},{"text":"Notes existence of a viable workaround via API","passed":true},{"text":"Explains distinction from system/module level issues","passed":true},{"text":"Correctly selects CR4 with clear reasoning","passed":true}]}],"assertions":[{"text":"Identifies issue as a single function failure","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: criticalityRating, reasoning | content_evaluator: The candidate answer accurately classifies the ticket as CR4, clearly distinguishes it from higher criticality by noting the workaround, and provides stepwise reasoning that aligns with the reference answer and guidelines."},{"text":"Notes existence of a viable workaround via API","passed":true},{"text":"Explains distinction from system/module level issues","passed":true},{"text":"Correctly selects CR4 with clear reasoning","passed":true}]}
diff --git a/examples/showcase/export-screening/evals/dataset.eval.baseline.jsonl b/examples/showcase/export-screening/evals/dataset.eval.baseline.jsonl
index 3f0da20e9..d03ef5ba9 100644
--- a/examples/showcase/export-screening/evals/dataset.eval.baseline.jsonl
+++ b/examples/showcase/export-screening/evals/dataset.eval.baseline.jsonl
@@ -1,22 +1,22 @@
-{"timestamp":"2026-02-20T21:43:16.769Z","test_id":"exp-high-001","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.775Z","test_id":"exp-high-003","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.778Z","test_id":"exp-high-002","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.840Z","test_id":"exp-high-004","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.849Z","test_id":"exp-high-006","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.852Z","test_id":"exp-high-005","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.916Z","test_id":"exp-high-007","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.920Z","test_id":"exp-high-008","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.924Z","test_id":"exp-high-009","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.981Z","test_id":"exp-high-010","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:16.996Z","test_id":"exp-high-011","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:17.001Z","test_id":"exp-high-012","dataset":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
-{"timestamp":"2026-02-20T21:43:17.052Z","test_id":"exp-med-001","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.069Z","test_id":"exp-med-002","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.076Z","test_id":"exp-med-003","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.130Z","test_id":"exp-med-004","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.145Z","test_id":"exp-low-001","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.151Z","test_id":"exp-low-002","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.211Z","test_id":"exp-low-003","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.217Z","test_id":"exp-low-004","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.233Z","test_id":"exp-low-005","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
-{"timestamp":"2026-02-20T21:43:17.277Z","test_id":"exp-low-006","dataset":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
+{"timestamp":"2026-02-20T21:43:16.769Z","test_id":"exp-high-001","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.775Z","test_id":"exp-high-003","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.778Z","test_id":"exp-high-002","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.840Z","test_id":"exp-high-004","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.849Z","test_id":"exp-high-006","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.852Z","test_id":"exp-high-005","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.916Z","test_id":"exp-high-007","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.920Z","test_id":"exp-high-008","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.924Z","test_id":"exp-high-009","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.981Z","test_id":"exp-high-010","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:16.996Z","test_id":"exp-high-011","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:17.001Z","test_id":"exp-high-012","suite":"dataset","conversation_id":"export-screening","score":1,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Correctly classified as High"},{"text":"riskLevel=High","passed":true},{"text":"Correct: AI=High, Expected=High","passed":true}]}
+{"timestamp":"2026-02-20T21:43:17.052Z","test_id":"exp-med-001","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.069Z","test_id":"exp-med-002","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.076Z","test_id":"exp-med-003","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.130Z","test_id":"exp-med-004","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Medium"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Medium","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.145Z","test_id":"exp-low-001","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.151Z","test_id":"exp-low-002","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.211Z","test_id":"exp-low-003","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.217Z","test_id":"exp-low-004","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.233Z","test_id":"exp-low-005","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
+{"timestamp":"2026-02-20T21:43:17.277Z","test_id":"exp-low-006","suite":"dataset","conversation_id":"export-screening","score":0,"target":"default","scores":[{"name":"risk_assessment_quality","type":"code-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}],"assertions":[{"text":"Valid JSON with required keys","passed":true,"evidence":"risk_assessment_quality: Misclassified: AI=High, Expected=Low"},{"text":"riskLevel=High","passed":true},{"text":"Mismatch: AI=High, Expected=Low","passed":false}]}
diff --git a/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl b/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl
index b7d2ba063..1bd1d8cd1 100644
--- a/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl
+++ b/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl
@@ -1,5 +1,5 @@
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "dataset": "offline-grader-benchmark", "score": 0.8333, "target": "setup-a", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.8333, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": true}]}], "assertions": []}]}
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]}
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "dataset": "offline-grader-benchmark", "score": 1.0, "target": "setup-a", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 1.0, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "dataset": "offline-grader-benchmark", "score": 0.1667, "target": "setup-a", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.1667, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "suite": "offline-grader-benchmark", "score": 0.8333, "target": "setup-a", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.8333, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": true}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "suite": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "suite": "offline-grader-benchmark", "score": 1.0, "target": "setup-a", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 1.0, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "suite": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "suite": "offline-grader-benchmark", "score": 0.1667, "target": "setup-a", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.1667, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]}
diff --git a/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl b/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl
index 21e2a6082..f2759e48f 100644
--- a/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl
+++ b/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl
@@ -1,5 +1,5 @@
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "dataset": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "dataset": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "dataset": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-b", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
-{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "dataset": "offline-grader-benchmark", "score": 0.5, "target": "setup-b", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.5, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "suite": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "suite": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "suite": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "suite": "offline-grader-benchmark", "score": 0.3333, "target": "setup-b", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]}
+{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "suite": "offline-grader-benchmark", "score": 0.5, "target": "setup-b", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.5, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]}
diff --git a/examples/showcase/psychotherapy/evals/encouragement.eval.baseline.jsonl b/examples/showcase/psychotherapy/evals/encouragement.eval.baseline.jsonl
index af3edc748..dcf9eb985 100644
--- a/examples/showcase/psychotherapy/evals/encouragement.eval.baseline.jsonl
+++ b/examples/showcase/psychotherapy/evals/encouragement.eval.baseline.jsonl
@@ -1,8 +1,8 @@
-{"timestamp":"2026-02-20T21:43:28.301Z","test_id":"encouragement-father-restraint","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Used required keys: validation_point, resource_identified, reframe_angle.","passed":true,"evidence":"The candidate followed all instructions perfectly, accurately identifying the resource and applying the framework as requested."},{"text":"Correctly identified 'self-restraint' as the primary resource.","passed":true},{"text":"Provided a high-quality reframe from failure to protection.","passed":true},{"text":"Followed the specified JSON structure accurately.","passed":true}]}],"assertions":[{"text":"Used required keys: validation_point, resource_identified, reframe_angle.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate followed all instructions perfectly, accurately identifying the resource and applying the framework as requested."},{"text":"Correctly identified 'self-restraint' as the primary resource.","passed":true},{"text":"Provided a high-quality reframe from failure to protection.","passed":true},{"text":"Followed the specified JSON structure accurately.","passed":true}]}
-{"timestamp":"2026-02-20T21:43:28.454Z","test_id":"encouragement-depressed-student","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies grooming as a courageous act of self-dignity.","passed":true,"evidence":"The candidate perfectly identifies grooming as a resource for dignity and life, provides deep validation, and avoids dismissing the effort as superficial, aligning exactly with the criteria and the specified framework."},{"text":"Validates the disconnect between external appearance and internal sadness.","passed":true},{"text":"Reframes the effort as a significant feat of willpower and agency.","passed":true},{"text":"Strictly follows the One-Key Three-Links framework and JSON schema.","passed":true}]}],"assertions":[{"text":"Identifies grooming as a courageous act of self-dignity.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate perfectly identifies grooming as a resource for dignity and life, provides deep validation, and avoids dismissing the effort as superficial, aligning exactly with the criteria and the specified framework."},{"text":"Validates the disconnect between external appearance and internal sadness.","passed":true},{"text":"Reframes the effort as a significant feat of willpower and agency.","passed":true},{"text":"Strictly follows the One-Key Three-Links framework and JSON schema.","passed":true}]}
-{"timestamp":"2026-02-20T21:43:30.074Z","test_id":"encouragement-guilty-sister","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Reframes guilt as evidence of love","passed":true,"evidence":"The candidate's response is excellent, perfectly adhering to the framework and meeting both specified criteria with high quality and compassion."},{"text":"Normalizes the feeling of guilt","passed":true},{"text":"Correctly uses One-Key Three-Links","passed":true},{"text":"Identifies empathy as a resource","passed":true}]}],"assertions":[{"text":"Reframes guilt as evidence of love","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate's response is excellent, perfectly adhering to the framework and meeting both specified criteria with high quality and compassion."},{"text":"Normalizes the feeling of guilt","passed":true},{"text":"Correctly uses One-Key Three-Links","passed":true},{"text":"Identifies empathy as a resource","passed":true}]}
-{"timestamp":"2026-02-20T21:43:39.828Z","test_id":"encouragement-nagging-wife","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identified 'fighting' as fighting for the relationship","passed":true,"evidence":"The candidate answer perfectly met all criteria, identifying both required resources and providing a high-quality therapeutic reframe."},{"text":"Identified 'enduring in-laws' as resilience/commitment","passed":true},{"text":"Followed the required JSON schema","passed":true},{"text":"Applied 'One-Key Three-Links' framework correctly","passed":true}]}],"assertions":[{"text":"Identified 'fighting' as fighting for the relationship","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate answer perfectly met all criteria, identifying both required resources and providing a high-quality therapeutic reframe."},{"text":"Identified 'enduring in-laws' as resilience/commitment","passed":true},{"text":"Followed the required JSON schema","passed":true},{"text":"Applied 'One-Key Three-Links' framework correctly","passed":true}]}
-{"timestamp":"2026-02-20T21:43:39.878Z","test_id":"encouragement-cutting-paradox","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identified survival instinct and breathing as the positive intent.","passed":true,"evidence":"The response perfectly satisfies all criteria, accurately identifying the positive intent as a survival strategy and maintaining a professional, clinical tone."},{"text":"Maintained clinical detachment without judgment.","passed":true},{"text":"Strictly followed the One-Key Three-Links framework.","passed":true},{"text":"Accurately reframed self-harm as a self-preservation effort.","passed":true}]}],"assertions":[{"text":"Correctly identified survival instinct and breathing as the positive intent.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly satisfies all criteria, accurately identifying the positive intent as a survival strategy and maintaining a professional, clinical tone."},{"text":"Maintained clinical detachment without judgment.","passed":true},{"text":"Strictly followed the One-Key Three-Links framework.","passed":true},{"text":"Accurately reframed self-harm as a self-preservation effort.","passed":true}]}
-{"timestamp":"2026-02-20T21:43:43.808Z","test_id":"encouragement-gym-student","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identified 'completing the task' as a key resource.","passed":true,"evidence":"The candidate followed all instructions, identifying the specific resources requested and correctly applying the therapeutic framework while avoiding a focus on the negative emotions."},{"text":"Identified 'staying grounded' (calming down) as a key resource.","passed":true},{"text":"Properly applied the One-Key Three-Links framework.","passed":true},{"text":"Successfully pivoted from embarrassment to grit and courage.","passed":true}]}],"assertions":[{"text":"Identified 'completing the task' as a key resource.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate followed all instructions, identifying the specific resources requested and correctly applying the therapeutic framework while avoiding a focus on the negative emotions."},{"text":"Identified 'staying grounded' (calming down) as a key resource.","passed":true},{"text":"Properly applied the One-Key Three-Links framework.","passed":true},{"text":"Successfully pivoted from embarrassment to grit and courage.","passed":true}]}
-{"timestamp":"2026-02-20T21:43:51.398Z","test_id":"adv-encouragement-toxic-positivity","dataset":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Validates profound emptiness","passed":true,"evidence":"The response perfectly follows the Resource-Focused framework, acknowledging the client's deep pain without minimization and identifying a clear resource in their behavior."},{"text":"Reframes 'faking it' as altruism","passed":true},{"text":"Avoided all clich\u00e9s","passed":true},{"text":"Followed framework structure","passed":true}]}],"assertions":[{"text":"Validates profound emptiness","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly follows the Resource-Focused framework, acknowledging the client's deep pain without minimization and identifying a clear resource in their behavior."},{"text":"Reframes 'faking it' as altruism","passed":true},{"text":"Avoided all clich\u00e9s","passed":true},{"text":"Followed framework structure","passed":true}]}
-{"timestamp":"2026-02-20T21:43:51.767Z","test_id":"adv-encouragement-enabling","dataset":"dataset-encouragement","score":0.65,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.3,"weight":1,"verdict":"fail","assertions":[{"text":"Correct JSON format and structure","passed":true,"evidence":"The response failed the primary negative constraint by praising the client's 'perseverance' and 'energy' spent checking the phone instead of identifying the underlying desire for safety/connection as the resource."},{"text":"Validates the underlying anxiety/fear","passed":true},{"text":"Identifies the intent to protect the marriage","passed":true},{"text":"Praised the 'perseverance' of staying up to check the phone","passed":false},{"text":"Failed to identify 'Desire for Connection/Safety' as the resource","passed":false},{"text":"Framed the act of checking as a 'protective instinct' (praising the behavior)","passed":false}]}],"assertions":[{"text":"Correct JSON format and structure","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response failed the primary negative constraint by praising the client's 'perseverance' and 'energy' spent checking the phone instead of identifying the underlying desire for safety/connection as the resource."},{"text":"Validates the underlying anxiety/fear","passed":true},{"text":"Identifies the intent to protect the marriage","passed":true},{"text":"Praised the 'perseverance' of staying up to check the phone","passed":false},{"text":"Failed to identify 'Desire for Connection/Safety' as the resource","passed":false},{"text":"Framed the act of checking as a 'protective instinct' (praising the 
behavior)","passed":false}]}
+{"timestamp":"2026-02-20T21:43:28.301Z","test_id":"encouragement-father-restraint","suite":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Used required keys: validation_point, resource_identified, reframe_angle.","passed":true,"evidence":"The candidate followed all instructions perfectly, accurately identifying the resource and applying the framework as requested."},{"text":"Correctly identified 'self-restraint' as the primary resource.","passed":true},{"text":"Provided a high-quality reframe from failure to protection.","passed":true},{"text":"Followed the specified JSON structure accurately.","passed":true}]}],"assertions":[{"text":"Used required keys: validation_point, resource_identified, reframe_angle.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate followed all instructions perfectly, accurately identifying the resource and applying the framework as requested."},{"text":"Correctly identified 'self-restraint' as the primary resource.","passed":true},{"text":"Provided a high-quality reframe from failure to protection.","passed":true},{"text":"Followed the specified JSON structure accurately.","passed":true}]}
+{"timestamp":"2026-02-20T21:43:28.454Z","test_id":"encouragement-depressed-student","suite":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identifies grooming as a courageous act of self-dignity.","passed":true,"evidence":"The candidate perfectly identifies grooming as a resource for dignity and life, provides deep validation, and avoids dismissing the effort as superficial, aligning exactly with the criteria and the specified framework."},{"text":"Validates the disconnect between external appearance and internal sadness.","passed":true},{"text":"Reframes the effort as a significant feat of willpower and agency.","passed":true},{"text":"Strictly follows the One-Key Three-Links framework and JSON schema.","passed":true}]}],"assertions":[{"text":"Identifies grooming as a courageous act of self-dignity.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate perfectly identifies grooming as a resource for dignity and life, provides deep validation, and avoids dismissing the effort as superficial, aligning exactly with the criteria and the specified framework."},{"text":"Validates the disconnect between external appearance and internal sadness.","passed":true},{"text":"Reframes the effort as a significant feat of willpower and agency.","passed":true},{"text":"Strictly follows the One-Key Three-Links framework and JSON schema.","passed":true}]}
+{"timestamp":"2026-02-20T21:43:30.074Z","test_id":"encouragement-guilty-sister","suite":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Reframes guilt as evidence of love","passed":true,"evidence":"The candidate's response is excellent, perfectly adhering to the framework and meeting both specified criteria with high quality and compassion."},{"text":"Normalizes the feeling of guilt","passed":true},{"text":"Correctly uses One-Key Three-Links","passed":true},{"text":"Identifies empathy as a resource","passed":true}]}],"assertions":[{"text":"Reframes guilt as evidence of love","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate's response is excellent, perfectly adhering to the framework and meeting both specified criteria with high quality and compassion."},{"text":"Normalizes the feeling of guilt","passed":true},{"text":"Correctly uses One-Key Three-Links","passed":true},{"text":"Identifies empathy as a resource","passed":true}]}
+{"timestamp":"2026-02-20T21:43:39.828Z","test_id":"encouragement-nagging-wife","suite":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identified 'fighting' as fighting for the relationship","passed":true,"evidence":"The candidate answer perfectly met all criteria, identifying both required resources and providing a high-quality therapeutic reframe."},{"text":"Identified 'enduring in-laws' as resilience/commitment","passed":true},{"text":"Followed the required JSON schema","passed":true},{"text":"Applied 'One-Key Three-Links' framework correctly","passed":true}]}],"assertions":[{"text":"Identified 'fighting' as fighting for the relationship","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate answer perfectly met all criteria, identifying both required resources and providing a high-quality therapeutic reframe."},{"text":"Identified 'enduring in-laws' as resilience/commitment","passed":true},{"text":"Followed the required JSON schema","passed":true},{"text":"Applied 'One-Key Three-Links' framework correctly","passed":true}]}
+{"timestamp":"2026-02-20T21:43:39.878Z","test_id":"encouragement-cutting-paradox","suite":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identified survival instinct and breathing as the positive intent.","passed":true,"evidence":"The response perfectly satisfies all criteria, accurately identifying the positive intent as a survival strategy and maintaining a professional, clinical tone."},{"text":"Maintained clinical detachment without judgment.","passed":true},{"text":"Strictly followed the One-Key Three-Links framework.","passed":true},{"text":"Accurately reframed self-harm as a self-preservation effort.","passed":true}]}],"assertions":[{"text":"Correctly identified survival instinct and breathing as the positive intent.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly satisfies all criteria, accurately identifying the positive intent as a survival strategy and maintaining a professional, clinical tone."},{"text":"Maintained clinical detachment without judgment.","passed":true},{"text":"Strictly followed the One-Key Three-Links framework.","passed":true},{"text":"Accurately reframed self-harm as a self-preservation effort.","passed":true}]}
+{"timestamp":"2026-02-20T21:43:43.808Z","test_id":"encouragement-gym-student","suite":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Identified 'completing the task' as a key resource.","passed":true,"evidence":"The candidate followed all instructions, identifying the specific resources requested and correctly applying the therapeutic framework while avoiding a focus on the negative emotions."},{"text":"Identified 'staying grounded' (calming down) as a key resource.","passed":true},{"text":"Properly applied the One-Key Three-Links framework.","passed":true},{"text":"Successfully pivoted from embarrassment to grit and courage.","passed":true}]}],"assertions":[{"text":"Identified 'completing the task' as a key resource.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate followed all instructions, identifying the specific resources requested and correctly applying the therapeutic framework while avoiding a focus on the negative emotions."},{"text":"Identified 'staying grounded' (calming down) as a key resource.","passed":true},{"text":"Properly applied the One-Key Three-Links framework.","passed":true},{"text":"Successfully pivoted from embarrassment to grit and courage.","passed":true}]}
+{"timestamp":"2026-02-20T21:43:51.398Z","test_id":"adv-encouragement-toxic-positivity","suite":"dataset-encouragement","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Validates profound emptiness","passed":true,"evidence":"The response perfectly follows the Resource-Focused framework, acknowledging the client's deep pain without minimization and identifying a clear resource in their behavior."},{"text":"Reframes 'faking it' as altruism","passed":true},{"text":"Avoided all clich\u00e9s","passed":true},{"text":"Followed framework structure","passed":true}]}],"assertions":[{"text":"Validates profound emptiness","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly follows the Resource-Focused framework, acknowledging the client's deep pain without minimization and identifying a clear resource in their behavior."},{"text":"Reframes 'faking it' as altruism","passed":true},{"text":"Avoided all clich\u00e9s","passed":true},{"text":"Followed framework structure","passed":true}]}
+{"timestamp":"2026-02-20T21:43:51.767Z","test_id":"adv-encouragement-enabling","suite":"dataset-encouragement","score":0.65,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.3,"weight":1,"verdict":"fail","assertions":[{"text":"Correct JSON format and structure","passed":true,"evidence":"The response failed the primary negative constraint by praising the client's 'perseverance' and 'energy' spent checking the phone instead of identifying the underlying desire for safety/connection as the resource."},{"text":"Validates the underlying anxiety/fear","passed":true},{"text":"Identifies the intent to protect the marriage","passed":true},{"text":"Praised the 'perseverance' of staying up to check the phone","passed":false},{"text":"Failed to identify 'Desire for Connection/Safety' as the resource","passed":false},{"text":"Framed the act of checking as a 'protective instinct' (praising the behavior)","passed":false}]}],"assertions":[{"text":"Correct JSON format and structure","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response failed the primary negative constraint by praising the client's 'perseverance' and 'energy' spent checking the phone instead of identifying the underlying desire for safety/connection as the resource."},{"text":"Validates the underlying anxiety/fear","passed":true},{"text":"Identifies the intent to protect the marriage","passed":true},{"text":"Praised the 'perseverance' of staying up to check the phone","passed":false},{"text":"Failed to identify 'Desire for Connection/Safety' as the resource","passed":false},{"text":"Framed the act of checking as a 'protective instinct' (praising the 
behavior)","passed":false}]}
diff --git a/examples/showcase/psychotherapy/evals/listening.eval.baseline.jsonl b/examples/showcase/psychotherapy/evals/listening.eval.baseline.jsonl
index 1aaa56a03..1296533e7 100644
--- a/examples/showcase/psychotherapy/evals/listening.eval.baseline.jsonl
+++ b/examples/showcase/psychotherapy/evals/listening.eval.baseline.jsonl
@@ -1,6 +1,6 @@
-{"timestamp":"2026-02-20T21:44:03.897Z","test_id":"listening-basic-overwhelmed-wife","dataset":"dataset-listening","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identified Level 2 emotions: Resentment (cold anger) and Despair (hopelessness).","passed":true,"evidence":"The candidate answer fully met all criteria, providing a deep and accurate psychological analysis that mirrored the reference answer's insights while maintaining the required format."},{"text":"Accurately captured the Pursuer-to-Withdrawn shift in Level 3.","passed":true},{"text":"Identified the 'Testing the relationship' dynamic regarding the internet bill.","passed":true},{"text":"Followed the specified JSON format and analysis structure perfectly.","passed":true}]}],"assertions":[{"text":"Correctly identified Level 2 emotions: Resentment (cold anger) and Despair (hopelessness).","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate answer fully met all criteria, providing a deep and accurate psychological analysis that mirrored the reference answer's insights while maintaining the required format."},{"text":"Accurately captured the Pursuer-to-Withdrawn shift in Level 3.","passed":true},{"text":"Identified the 'Testing the relationship' dynamic regarding the internet bill.","passed":true},{"text":"Followed the specified JSON format and analysis structure perfectly.","passed":true}]}
-{"timestamp":"2026-02-20T21:44:07.388Z","test_id":"listening-basic-traditional-mother","dataset":"dataset-listening","score":0.95,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified 'Enmeshment' in Level 3.","passed":true,"evidence":"The candidate provided an excellent psychological analysis that accurately described all dynamics required by the criteria (Enmeshment, Gaslighting, and Patriarchal exploitation), though it omitted the specific labels 'Patriarchal' and 'Gaslighting'."},{"text":"Correctly identified exploitation (son's success via daughter's labor).","passed":true},{"text":"Recognized the manipulative reframing of the daughter as the 'wrong' party.","passed":true},{"text":"Strong suggested response that captures the client's internal logic.","passed":true},{"text":"Did not explicitly use the term 'Patriarchal values' or 'Patriarchy'.","passed":false},{"text":"Did not explicitly use the term 'Gaslighting'.","passed":false}]}],"assertions":[{"text":"Explicitly identified 'Enmeshment' in Level 3.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided an excellent psychological analysis that accurately described all dynamics required by the criteria (Enmeshment, Gaslighting, and Patriarchal exploitation), though it omitted the specific labels 'Patriarchal' and 'Gaslighting'."},{"text":"Correctly identified exploitation (son's success via daughter's labor).","passed":true},{"text":"Recognized the manipulative reframing of the daughter as the 'wrong' party.","passed":true},{"text":"Strong suggested response that captures the client's 
internal logic.","passed":true},{"text":"Did not explicitly use the term 'Patriarchal values' or 'Patriarchy'.","passed":false},{"text":"Did not explicitly use the term 'Gaslighting'.","passed":false}]}
-{"timestamp":"2026-02-20T21:44:10.121Z","test_id":"listening-basic-returning-mother","dataset":"dataset-listening","score":0.975,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.95,"weight":1,"verdict":"pass","assertions":[{"text":"Identified internalization of rejection as worthlessness in level_3_process","passed":true,"evidence":"The response perfectly identifies the core psychological dynamics required by the criteria, though it fails on technical JSON validity due to a trailing comma."},{"text":"Comprehensive analysis of intrapsychic dynamics and globalized self-attack","passed":true},{"text":"Strong adherence to the three levels of listening framework","passed":true},{"text":"High-quality empathetic suggested response","passed":true},{"text":"Syntax error: trailing comma in JSON analysis object","passed":false}]}],"assertions":[{"text":"Identified internalization of rejection as worthlessness in level_3_process","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly identifies the core psychological dynamics required by the criteria, though it fails on technical JSON validity due to a trailing comma."},{"text":"Comprehensive analysis of intrapsychic dynamics and globalized self-attack","passed":true},{"text":"Strong adherence to the three levels of listening framework","passed":true},{"text":"High-quality empathetic suggested response","passed":true},{"text":"Syntax error: trailing comma in JSON analysis object","passed":false}]}
-{"timestamp":"2026-02-20T21:44:20.369Z","test_id":"listening-adv-somatic-mask","dataset":"dataset-listening","score":0.9,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.8,"weight":1,"verdict":"pass","assertions":[{"text":"Identified Somatization as the mechanism for throat tightness.","passed":true,"evidence":"The candidate provided a high-quality analysis and correctly identified somatic displacement (as somatization), but failed to include the specific term 'Alexithymia' required by the criteria."},{"text":"Correctly linked physical symptoms to suppressed stress/emotions.","passed":true},{"text":"Accurately analyzed the client's 'High Performer' self-concept.","passed":true},{"text":"Provided a high-quality suggested response that bridges body and mind.","passed":true},{"text":"Failed to explicitly name or identify 'Alexithymia' in the analysis.","passed":false}]}],"assertions":[{"text":"Identified Somatization as the mechanism for throat tightness.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided a high-quality analysis and correctly identified somatic displacement (as somatization), but failed to include the specific term 'Alexithymia' required by the criteria."},{"text":"Correctly linked physical symptoms to suppressed stress/emotions.","passed":true},{"text":"Accurately analyzed the client's 'High Performer' self-concept.","passed":true},{"text":"Provided a high-quality suggested response that bridges body and mind.","passed":true},{"text":"Failed to explicitly name or identify 'Alexithymia' in the analysis.","passed":false}]}
-{"timestamp":"2026-02-20T21:44:20.821Z","test_id":"listening-adv-gatekeeper","dataset":"dataset-listening","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified 'Triangulation' in level_3_process.","passed":true,"evidence":"The candidate's analysis is comprehensive and insightful, perfectly capturing the core psychological dynamics of triangulation and the paradox of the client's 'helpful' gatekeeping."},{"text":"Correctly linked his 'helping' role to the loss of 'authenticity' in the relationship.","passed":true},{"text":"Accurately captured the 'sanitizing' nature of the client's communication in Level 1.","passed":true},{"text":"Identified the underlying anxiety and hyper-vigilance in Level 2.","passed":true}]}],"assertions":[{"text":"Explicitly identified 'Triangulation' in level_3_process.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate's analysis is comprehensive and insightful, perfectly capturing the core psychological dynamics of triangulation and the paradox of the client's 'helpful' gatekeeping."},{"text":"Correctly linked his 'helping' role to the loss of 'authenticity' in the relationship.","passed":true},{"text":"Accurately captured the 'sanitizing' nature of the client's communication in Level 1.","passed":true},{"text":"Identified the underlying anxiety and hyper-vigilance in Level 2.","passed":true}]}
-{"timestamp":"2026-02-20T21:44:24.578Z","test_id":"listening-adv-intellectualizer","dataset":"dataset-listening","score":0.95,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified Intellectualization as a defense mechanism in Level 3.","passed":true,"evidence":"The candidate provided an excellent psychological analysis and met the core requirement, but failed to follow the exact JSON structure provided in the template."},{"text":"Insightful analysis of implicit emotions like anxiety and resistance to vulnerability.","passed":true},{"text":"Draft response successfully bridges analytical skills with emotional reflection.","passed":true},{"text":"Accurately summarized the technical content of the client's statement.","passed":true},{"text":"Incorrectly nested the 'suggested_response' object inside the 'analysis' object.","passed":false}]}],"assertions":[{"text":"Explicitly identified Intellectualization as a defense mechanism in Level 3.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided an excellent psychological analysis and met the core requirement, but failed to follow the exact JSON structure provided in the template."},{"text":"Insightful analysis of implicit emotions like anxiety and resistance to vulnerability.","passed":true},{"text":"Draft response successfully bridges analytical skills with emotional reflection.","passed":true},{"text":"Accurately summarized the technical content of the client's statement.","passed":true},{"text":"Incorrectly nested the 'suggested_response' object inside the 'analysis' object.","passed":false}]}
+{"timestamp":"2026-02-20T21:44:03.897Z","test_id":"listening-basic-overwhelmed-wife","suite":"dataset-listening","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Correctly identified Level 2 emotions: Resentment (cold anger) and Despair (hopelessness).","passed":true,"evidence":"The candidate answer fully met all criteria, providing a deep and accurate psychological analysis that mirrored the reference answer's insights while maintaining the required format."},{"text":"Accurately captured the Pursuer-to-Withdrawn shift in Level 3.","passed":true},{"text":"Identified the 'Testing the relationship' dynamic regarding the internet bill.","passed":true},{"text":"Followed the specified JSON format and analysis structure perfectly.","passed":true}]}],"assertions":[{"text":"Correctly identified Level 2 emotions: Resentment (cold anger) and Despair (hopelessness).","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate answer fully met all criteria, providing a deep and accurate psychological analysis that mirrored the reference answer's insights while maintaining the required format."},{"text":"Accurately captured the Pursuer-to-Withdrawn shift in Level 3.","passed":true},{"text":"Identified the 'Testing the relationship' dynamic regarding the internet bill.","passed":true},{"text":"Followed the specified JSON format and analysis structure perfectly.","passed":true}]}
+{"timestamp":"2026-02-20T21:44:07.388Z","test_id":"listening-basic-traditional-mother","suite":"dataset-listening","score":0.95,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified 'Enmeshment' in Level 3.","passed":true,"evidence":"The candidate provided an excellent psychological analysis that accurately described all dynamics required by the criteria (Enmeshment, Gaslighting, and Patriarchal exploitation), though it omitted the specific labels 'Patriarchal' and 'Gaslighting'."},{"text":"Correctly identified exploitation (son's success via daughter's labor).","passed":true},{"text":"Recognized the manipulative reframing of the daughter as the 'wrong' party.","passed":true},{"text":"Strong suggested response that captures the client's internal logic.","passed":true},{"text":"Did not explicitly use the term 'Patriarchal values' or 'Patriarchy'.","passed":false},{"text":"Did not explicitly use the term 'Gaslighting'.","passed":false}]}],"assertions":[{"text":"Explicitly identified 'Enmeshment' in Level 3.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided an excellent psychological analysis that accurately described all dynamics required by the criteria (Enmeshment, Gaslighting, and Patriarchal exploitation), though it omitted the specific labels 'Patriarchal' and 'Gaslighting'."},{"text":"Correctly identified exploitation (son's success via daughter's labor).","passed":true},{"text":"Recognized the manipulative reframing of the daughter as the 'wrong' party.","passed":true},{"text":"Strong suggested response that captures the client's 
internal logic.","passed":true},{"text":"Did not explicitly use the term 'Patriarchal values' or 'Patriarchy'.","passed":false},{"text":"Did not explicitly use the term 'Gaslighting'.","passed":false}]}
+{"timestamp":"2026-02-20T21:44:10.121Z","test_id":"listening-basic-returning-mother","suite":"dataset-listening","score":0.975,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.95,"weight":1,"verdict":"pass","assertions":[{"text":"Identified internalization of rejection as worthlessness in level_3_process","passed":true,"evidence":"The response perfectly identifies the core psychological dynamics required by the criteria, though it fails on technical JSON validity due to a trailing comma."},{"text":"Comprehensive analysis of intrapsychic dynamics and globalized self-attack","passed":true},{"text":"Strong adherence to the three levels of listening framework","passed":true},{"text":"High-quality empathetic suggested response","passed":true},{"text":"Syntax error: trailing comma in JSON analysis object","passed":false}]}],"assertions":[{"text":"Identified internalization of rejection as worthlessness in level_3_process","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The response perfectly identifies the core psychological dynamics required by the criteria, though it fails on technical JSON validity due to a trailing comma."},{"text":"Comprehensive analysis of intrapsychic dynamics and globalized self-attack","passed":true},{"text":"Strong adherence to the three levels of listening framework","passed":true},{"text":"High-quality empathetic suggested response","passed":true},{"text":"Syntax error: trailing comma in JSON analysis object","passed":false}]}
+{"timestamp":"2026-02-20T21:44:20.369Z","test_id":"listening-adv-somatic-mask","suite":"dataset-listening","score":0.9,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.8,"weight":1,"verdict":"pass","assertions":[{"text":"Identified Somatization as the mechanism for throat tightness.","passed":true,"evidence":"The candidate provided a high-quality analysis and correctly identified somatic displacement (as somatization), but failed to include the specific term 'Alexithymia' required by the criteria."},{"text":"Correctly linked physical symptoms to suppressed stress/emotions.","passed":true},{"text":"Accurately analyzed the client's 'High Performer' self-concept.","passed":true},{"text":"Provided a high-quality suggested response that bridges body and mind.","passed":true},{"text":"Failed to explicitly name or identify 'Alexithymia' in the analysis.","passed":false}]}],"assertions":[{"text":"Identified Somatization as the mechanism for throat tightness.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided a high-quality analysis and correctly identified somatic displacement (as somatization), but failed to include the specific term 'Alexithymia' required by the criteria."},{"text":"Correctly linked physical symptoms to suppressed stress/emotions.","passed":true},{"text":"Accurately analyzed the client's 'High Performer' self-concept.","passed":true},{"text":"Provided a high-quality suggested response that bridges body and mind.","passed":true},{"text":"Failed to explicitly name or identify 'Alexithymia' in the analysis.","passed":false}]}
+{"timestamp":"2026-02-20T21:44:20.821Z","test_id":"listening-adv-gatekeeper","suite":"dataset-listening","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified 'Triangulation' in level_3_process.","passed":true,"evidence":"The candidate's analysis is comprehensive and insightful, perfectly capturing the core psychological dynamics of triangulation and the paradox of the client's 'helpful' gatekeeping."},{"text":"Correctly linked his 'helping' role to the loss of 'authenticity' in the relationship.","passed":true},{"text":"Accurately captured the 'sanitizing' nature of the client's communication in Level 1.","passed":true},{"text":"Identified the underlying anxiety and hyper-vigilance in Level 2.","passed":true}]}],"assertions":[{"text":"Explicitly identified 'Triangulation' in level_3_process.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate's analysis is comprehensive and insightful, perfectly capturing the core psychological dynamics of triangulation and the paradox of the client's 'helpful' gatekeeping."},{"text":"Correctly linked his 'helping' role to the loss of 'authenticity' in the relationship.","passed":true},{"text":"Accurately captured the 'sanitizing' nature of the client's communication in Level 1.","passed":true},{"text":"Identified the underlying anxiety and hyper-vigilance in Level 2.","passed":true}]}
+{"timestamp":"2026-02-20T21:44:24.578Z","test_id":"listening-adv-intellectualizer","suite":"dataset-listening","score":0.95,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: analysis","passed":true,"evidence":"Valid JSON with all required keys: analysis"}]},{"name":"content_evaluator","type":"llm-grader","score":0.9,"weight":1,"verdict":"pass","assertions":[{"text":"Explicitly identified Intellectualization as a defense mechanism in Level 3.","passed":true,"evidence":"The candidate provided an excellent psychological analysis and met the core requirement, but failed to follow the exact JSON structure provided in the template."},{"text":"Insightful analysis of implicit emotions like anxiety and resistance to vulnerability.","passed":true},{"text":"Draft response successfully bridges analytical skills with emotional reflection.","passed":true},{"text":"Accurately summarized the technical content of the client's statement.","passed":true},{"text":"Incorrectly nested the 'suggested_response' object inside the 'analysis' object.","passed":false}]}],"assertions":[{"text":"Explicitly identified Intellectualization as a defense mechanism in Level 3.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: analysis | content_evaluator: The candidate provided an excellent psychological analysis and met the core requirement, but failed to follow the exact JSON structure provided in the template."},{"text":"Insightful analysis of implicit emotions like anxiety and resistance to vulnerability.","passed":true},{"text":"Draft response successfully bridges analytical skills with emotional reflection.","passed":true},{"text":"Accurately summarized the technical content of the client's statement.","passed":true},{"text":"Incorrectly nested the 'suggested_response' object inside the 'analysis' object.","passed":false}]}
diff --git a/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl b/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl
index 52e119a82..aea065547 100644
--- a/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl
+++ b/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl
@@ -1,4 +1,4 @@
-{"timestamp":"2026-02-20T21:44:37.826Z","test_id":"route-to-encouragement-father","dataset":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]}],"assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]}
-{"timestamp":"2026-02-20T21:44:40.629Z","test_id":"route-to-encouragement-job","dataset":"dataset-routing","score":0.875,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0.75,"weight":1,"verdict":"fail","assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]}],"assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use 
specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]}
-{"timestamp":"2026-02-20T21:44:43.409Z","test_id":"route-to-listening","dataset":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]}],"assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]}
-{"timestamp":"2026-02-20T21:44:58.472Z","test_id":"route-to-listening-gatekeeper","dataset":"dataset-routing","score":0.5,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]}],"assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the 
requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]}
+{"timestamp":"2026-02-20T21:44:37.826Z","test_id":"route-to-encouragement-father","suite":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]}],"assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]}
+{"timestamp":"2026-02-20T21:44:40.629Z","test_id":"route-to-encouragement-job","suite":"dataset-routing","score":0.875,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0.75,"weight":1,"verdict":"fail","assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]}],"assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use 
specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]}
+{"timestamp":"2026-02-20T21:44:43.409Z","test_id":"route-to-listening","suite":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]}],"assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]}
+{"timestamp":"2026-02-20T21:44:58.472Z","test_id":"route-to-listening-gatekeeper","suite":"dataset-routing","score":0.5,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]}],"assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the 
requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]}
diff --git a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.baseline.jsonl b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.baseline.jsonl
index 93556a449..ba5500a57 100644
--- a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.baseline.jsonl
+++ b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.baseline.jsonl
@@ -1,4 +1,4 @@
-{"timestamp":"2026-02-20T21:44:59.088Z","test_id":"tool-selection-demo","dataset":"tool-eval-demo","score":1,"target":"mock_agent","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"search: called 1 times (required \u22651)","passed":true},{"text":"fetch: called 1 times (required \u22651)","passed":true}]},{"name":"selection-quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool 'search' appears relevant to task","passed":true,"evidence":"Evaluated 2 tool(s) against task requirements. 2 appropriate, 0 issues found."},{"text":"Tool 'fetch' appears relevant to task","passed":true}]}],"assertions":[{"text":"search: called 1 times (required \u22651)","passed":true,"evidence":"selection-quality: Evaluated 2 tool(s) against task requirements. 2 appropriate, 0 issues found."},{"text":"fetch: called 1 times (required \u22651)","passed":true},{"text":"Tool 'search' appears relevant to task","passed":true},{"text":"Tool 'fetch' appears relevant to task","passed":true}]}
-{"timestamp":"2026-02-20T21:44:59.093Z","test_id":"efficiency-demo","dataset":"tool-eval-demo","score":0.93,"target":"mock_agent","scores":[{"name":"efficiency-check","type":"code-grader","score":0.93,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls (1) within budget (10)","passed":true,"evidence":"Task complexity: simple. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (40) within budget","passed":true},{"text":"Cost ($0.0003) within budget","passed":true},{"text":"High exploration ratio: 1.00 (target: 0.60)","passed":false}]}],"assertions":[{"text":"Tool calls (1) within budget (10)","passed":true,"evidence":"efficiency-check: Task complexity: simple. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (40) within budget","passed":true},{"text":"Cost ($0.0003) within budget","passed":true},{"text":"High exploration ratio: 1.00 (target: 0.60)","passed":false}]}
-{"timestamp":"2026-02-20T21:44:59.155Z","test_id":"combined-evaluation","dataset":"tool-eval-demo","score":0.7766666666666667,"target":"mock_agent","scores":[{"name":"workflow-trajectory","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found validate at position 1","passed":true},{"text":"Found process at position 2","passed":true}]},{"name":"selection-check","type":"code-grader","score":0.4,"weight":1,"verdict":"fail","assertions":[{"text":"Tool 'search' appears relevant to task","passed":true,"evidence":"Evaluated 3 tool(s) against task requirements. 2 appropriate, 3 issues found."},{"text":"Tool 'validate' appears relevant to task","passed":true},{"text":"Tool 'process' may not be needed for this task","passed":false},{"text":"Expected a 'write'-type tool but none used","passed":false},{"text":"Expected a 'analyze'-type tool but none used","passed":false}]},{"name":"efficiency","type":"code-grader","score":0.93,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls (3) within budget (10)","passed":true,"evidence":"Task complexity: complex. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (475) within budget","passed":true},{"text":"Cost ($0.0032) within budget","passed":true},{"text":"Low exploration ratio: 0.33 (target: 0.60)","passed":false}]}],"assertions":[{"text":"Found search at position 0","passed":true,"evidence":"selection-check: Evaluated 3 tool(s) against task requirements. 2 appropriate, 3 issues found. | efficiency: Task complexity: complex. Evaluated 4 criteria. 
Score: 0.93"},{"text":"Found validate at position 1","passed":true},{"text":"Found process at position 2","passed":true},{"text":"Tool 'search' appears relevant to task","passed":true},{"text":"Tool 'validate' appears relevant to task","passed":true},{"text":"Tool calls (3) within budget (10)","passed":true},{"text":"Token usage (475) within budget","passed":true},{"text":"Cost ($0.0032) within budget","passed":true},{"text":"Tool 'process' may not be needed for this task","passed":false},{"text":"Expected a 'write'-type tool but none used","passed":false},{"text":"Expected a 'analyze'-type tool but none used","passed":false},{"text":"Low exploration ratio: 0.33 (target: 0.60)","passed":false}]}
-{"timestamp":"2026-02-20T21:44:59.216Z","test_id":"pairwise-demo","dataset":"tool-eval-demo","score":1,"target":"mock_agent","scores":[{"name":"pairwise-quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"More diverse tools: 2 types","passed":true,"evidence":"Pass 1: A wins. Pass 2 (swapped): B wins (maps to A). Consistency: true. Final: A (high confidence)"},{"text":"Response A used tools; B did not","passed":true}]}],"assertions":[{"text":"More diverse tools: 2 types","passed":true,"evidence":"pairwise-quality: Pass 1: A wins. Pass 2 (swapped): B wins (maps to A). Consistency: true. Final: A (high confidence)"},{"text":"Response A used tools; B did not","passed":true}]}
+{"timestamp":"2026-02-20T21:44:59.088Z","test_id":"tool-selection-demo","suite":"tool-eval-demo","score":1,"target":"mock_agent","scores":[{"name":"trajectory-check","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"search: called 1 times (required \u22651)","passed":true},{"text":"fetch: called 1 times (required \u22651)","passed":true}]},{"name":"selection-quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Tool 'search' appears relevant to task","passed":true,"evidence":"Evaluated 2 tool(s) against task requirements. 2 appropriate, 0 issues found."},{"text":"Tool 'fetch' appears relevant to task","passed":true}]}],"assertions":[{"text":"search: called 1 times (required \u22651)","passed":true,"evidence":"selection-quality: Evaluated 2 tool(s) against task requirements. 2 appropriate, 0 issues found."},{"text":"fetch: called 1 times (required \u22651)","passed":true},{"text":"Tool 'search' appears relevant to task","passed":true},{"text":"Tool 'fetch' appears relevant to task","passed":true}]}
+{"timestamp":"2026-02-20T21:44:59.093Z","test_id":"efficiency-demo","suite":"tool-eval-demo","score":0.93,"target":"mock_agent","scores":[{"name":"efficiency-check","type":"code-grader","score":0.93,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls (1) within budget (10)","passed":true,"evidence":"Task complexity: simple. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (40) within budget","passed":true},{"text":"Cost ($0.0003) within budget","passed":true},{"text":"High exploration ratio: 1.00 (target: 0.60)","passed":false}]}],"assertions":[{"text":"Tool calls (1) within budget (10)","passed":true,"evidence":"efficiency-check: Task complexity: simple. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (40) within budget","passed":true},{"text":"Cost ($0.0003) within budget","passed":true},{"text":"High exploration ratio: 1.00 (target: 0.60)","passed":false}]}
+{"timestamp":"2026-02-20T21:44:59.155Z","test_id":"combined-evaluation","suite":"tool-eval-demo","score":0.7766666666666667,"target":"mock_agent","scores":[{"name":"workflow-trajectory","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found validate at position 1","passed":true},{"text":"Found process at position 2","passed":true}]},{"name":"selection-check","type":"code-grader","score":0.4,"weight":1,"verdict":"fail","assertions":[{"text":"Tool 'search' appears relevant to task","passed":true,"evidence":"Evaluated 3 tool(s) against task requirements. 2 appropriate, 3 issues found."},{"text":"Tool 'validate' appears relevant to task","passed":true},{"text":"Tool 'process' may not be needed for this task","passed":false},{"text":"Expected a 'write'-type tool but none used","passed":false},{"text":"Expected a 'analyze'-type tool but none used","passed":false}]},{"name":"efficiency","type":"code-grader","score":0.93,"weight":1,"verdict":"pass","assertions":[{"text":"Tool calls (3) within budget (10)","passed":true,"evidence":"Task complexity: complex. Evaluated 4 criteria. Score: 0.93"},{"text":"Token usage (475) within budget","passed":true},{"text":"Cost ($0.0032) within budget","passed":true},{"text":"Low exploration ratio: 0.33 (target: 0.60)","passed":false}]}],"assertions":[{"text":"Found search at position 0","passed":true,"evidence":"selection-check: Evaluated 3 tool(s) against task requirements. 2 appropriate, 3 issues found. | efficiency: Task complexity: complex. Evaluated 4 criteria. 
Score: 0.93"},{"text":"Found validate at position 1","passed":true},{"text":"Found process at position 2","passed":true},{"text":"Tool 'search' appears relevant to task","passed":true},{"text":"Tool 'validate' appears relevant to task","passed":true},{"text":"Tool calls (3) within budget (10)","passed":true},{"text":"Token usage (475) within budget","passed":true},{"text":"Cost ($0.0032) within budget","passed":true},{"text":"Tool 'process' may not be needed for this task","passed":false},{"text":"Expected a 'write'-type tool but none used","passed":false},{"text":"Expected a 'analyze'-type tool but none used","passed":false},{"text":"Low exploration ratio: 0.33 (target: 0.60)","passed":false}]}
+{"timestamp":"2026-02-20T21:44:59.216Z","test_id":"pairwise-demo","suite":"tool-eval-demo","score":1,"target":"mock_agent","scores":[{"name":"pairwise-quality","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"More diverse tools: 2 types","passed":true,"evidence":"Pass 1: A wins. Pass 2 (swapped): B wins (maps to A). Consistency: true. Final: A (high confidence)"},{"text":"Response A used tools; B did not","passed":true}]}],"assertions":[{"text":"More diverse tools: 2 types","passed":true,"evidence":"pairwise-quality: Pass 1: A wins. Pass 2 (swapped): B wins (maps to A). Consistency: true. Final: A (high confidence)"},{"text":"Response A used tools; B did not","passed":true}]}
diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts
index 6d5e08a5e..f426b155b 100644
--- a/packages/core/src/evaluation/loaders/jsonl-parser.ts
+++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts
@@ -34,7 +34,7 @@ function matchesFilter(id: string, filter: string | readonly string[]): boolean
 }
 
 /**
- * Sidecar metadata structure for JSONL datasets.
+ * Sidecar metadata structure for JSONL suites.
  */
 type SidecarMetadata = {
   readonly description?: string;
@@ -73,7 +73,7 @@ export function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills
 }
 
 /**
- * Load sidecar YAML metadata file for a JSONL dataset.
+ * Load sidecar YAML metadata file for a JSONL suite.
  */
 async function loadSidecarMetadata(jsonlPath: string, verbose: boolean): Promise<SidecarMetadata> {
   const dir = path.dirname(jsonlPath);
@@ -158,10 +158,10 @@ export async function loadTestsFromJsonl(
   const rawFile = await readFile(absoluteTestPath, 'utf8');
   const rawCases = parseJsonlContent(rawFile, evalFilePath);
 
-  // Derive dataset name: sidecar > filename
-  const fallbackDatasetName = path.basename(absoluteTestPath, '.jsonl') || 'eval';
-  const datasetName =
-    sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackDatasetName;
+  // Derive suite name: sidecar > filename
+  const fallbackSuiteName = path.basename(absoluteTestPath, '.jsonl') || 'eval';
+  const suiteName =
+    sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
 
   // Global defaults from sidecar
   const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar') ?? 'llm-grader';
@@ -170,7 +170,7 @@ export async function loadTestsFromJsonl(
   if (verbose) {
     console.log(`\n[JSONL Dataset: ${evalFilePath}]`);
     console.log(`  Cases: ${rawCases.length}`);
-    console.log(`  Dataset: ${datasetName}`);
+    console.log(`  Suite: ${suiteName}`);
     if (sidecar.description) {
       console.log(`  Description: ${sidecar.description}`);
     }
@@ -302,7 +302,7 @@ export async function loadTestsFromJsonl(
 
     const testCase: EvalTest = {
       id,
-      dataset: datasetName,
+      suite: suiteName,
       conversation_id: conversationId,
       question: question,
       input: inputMessages,
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index f5ad8060f..68191dd11 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -829,7 +829,7 @@ export async function runEvaluation(
           const budgetResult: EvaluationResult = {
             timestamp: (now ?? (() => new Date()))().toISOString(),
             testId: evalCase.id,
-            dataset: evalCase.dataset,
+            suite: evalCase.suite,
             category: evalCase.category,
             score: 0,
             assertions: [],
@@ -869,7 +869,7 @@ export async function runEvaluation(
           const haltResult: EvaluationResult = {
             timestamp: (now ?? (() => new Date()))().toISOString(),
             testId: evalCase.id,
-            dataset: evalCase.dataset,
+            suite: evalCase.suite,
             category: evalCase.category,
             score: 0,
             assertions: [],
@@ -2146,7 +2146,7 @@ async function evaluateCandidate(options: {
   return {
     timestamp: completedAt.toISOString(),
     testId: evalCase.id,
-    dataset: evalCase.dataset,
+    suite: evalCase.suite,
     category: evalCase.category,
     conversationId: evalCase.conversation_id,
     score: score.score,
@@ -2641,7 +2641,7 @@ function buildErrorResult(
   return {
     timestamp: timestamp.toISOString(),
     testId: evalCase.id,
-    dataset: evalCase.dataset,
+    suite: evalCase.suite,
     category: evalCase.category,
     conversationId: evalCase.conversation_id,
     score: 0,
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index a554ac85b..51d2841ae 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -771,7 +771,7 @@ export type EvaluatorConfig =
  */
 export interface EvalTest {
   readonly id: string;
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly category?: string;
   readonly conversation_id?: string;
   readonly question: string;
@@ -894,7 +894,7 @@ export type FailOnError = boolean;
 export interface EvaluationResult {
   readonly timestamp: string;
   readonly testId: string;
-  readonly dataset?: string;
+  readonly suite?: string;
   readonly category?: string;
   readonly conversationId?: string;
   readonly score: number;
diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts
index 0bdd5bce8..9a0686cca 100644
--- a/packages/core/src/evaluation/validation/eval-file.schema.ts
+++ b/packages/core/src/evaluation/validation/eval-file.schema.ts
@@ -345,7 +345,7 @@ const EvalTestSchema = z.object({
   workspace: WorkspaceSchema.optional(),
   metadata: z.record(z.unknown()).optional(),
   conversation_id: z.string().optional(),
-  dataset: z.string().optional(),
+  suite: z.string().optional(),
   note: z.string().optional(),
 });
 
diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts
index cdf00fe4d..d1ce1aa5c 100644
--- a/packages/core/src/evaluation/validation/eval-validator.ts
+++ b/packages/core/src/evaluation/validation/eval-validator.ts
@@ -65,7 +65,7 @@ const KNOWN_TEST_FIELDS = new Set([
   'workspace',
   'metadata',
   'conversation_id',
-  'dataset',
+  'suite',
   'note',
 ]);
 
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 1117dc7ed..24f0c65bd 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -273,16 +273,14 @@ async function loadTestsFromYaml(
   }
 
   const suite = interpolated as RawTestSuite;
-  const datasetNameFromSuite = asString(suite.name)?.trim();
-  const fallbackDatasetName =
+  const suiteNameFromFile = asString(suite.name)?.trim();
+  const fallbackSuiteName =
     path
       .basename(absoluteTestPath)
       .replace(/\.eval\.ya?ml$/i, '')
       .replace(/\.ya?ml$/i, '') || 'eval';
-  const datasetName =
-    datasetNameFromSuite && datasetNameFromSuite.length > 0
-      ? datasetNameFromSuite
-      : fallbackDatasetName;
+  const suiteName =
+    suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
 
   const rawTestCases = resolveTests(suite);
 
@@ -490,7 +488,7 @@ async function loadTestsFromYaml(
 
     const testCase: EvalTest = {
       id,
-      dataset: datasetName,
+      suite: suiteName,
       category: options?.category,
       conversation_id: conversationId,
       question: question,
diff --git a/packages/core/src/observability/otel-exporter.ts b/packages/core/src/observability/otel-exporter.ts
index de08959ed..73f1a98b1 100644
--- a/packages/core/src/observability/otel-exporter.ts
+++ b/packages/core/src/observability/otel-exporter.ts
@@ -182,7 +182,7 @@ export class OtelTraceExporter {
         // Core attributes
         rootSpan.setAttribute('agentv.test_id', result.testId);
         rootSpan.setAttribute('agentv.target', result.target);
-        if (result.dataset) rootSpan.setAttribute('agentv.dataset', result.dataset);
+        if (result.suite) rootSpan.setAttribute('agentv.suite', result.suite);
         rootSpan.setAttribute('agentv.score', result.score);
         if (captureContent && result.output.length > 0) {
           const lastMsg = result.output[result.output.length - 1];
@@ -455,7 +455,7 @@ export class OtelStreamingObserver {
     this.rootSpan.setAttribute('gen_ai.system', 'agentv');
     this.rootSpan.setAttribute('agentv.test_id', testId);
     this.rootSpan.setAttribute('agentv.target', target);
-    if (evalSet) this.rootSpan.setAttribute('agentv.dataset', evalSet);
+    if (evalSet) this.rootSpan.setAttribute('agentv.suite', evalSet);
     this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
   }
 
diff --git a/packages/core/test/evaluation/baseline.test.ts b/packages/core/test/evaluation/baseline.test.ts
index bebc0318d..2174f1eb4 100644
--- a/packages/core/test/evaluation/baseline.test.ts
+++ b/packages/core/test/evaluation/baseline.test.ts
@@ -6,7 +6,7 @@ function makeFullResult(overrides: Partial<EvaluationResult> = {}): EvaluationRe
   return {
     timestamp: '2026-01-01T00:00:00.000Z',
     testId: 'test-case',
-    dataset: 'test-dataset',
+    suite: 'test-dataset',
     conversationId: 'conv-1',
     score: 0.85,
     assertions: [
@@ -56,7 +56,7 @@ describe('trimBaselineResult', () => {
 
     expect(trimmed.timestamp).toBe(full.timestamp);
     expect(trimmed.testId).toBe(full.testId);
-    expect(trimmed.dataset).toBe(full.dataset);
+    expect(trimmed.suite).toBe(full.suite);
     expect(trimmed.conversationId).toBe(full.conversationId);
     expect(trimmed.score).toBe(full.score);
     expect(trimmed.assertions).toEqual(full.assertions);
diff --git a/packages/core/test/evaluation/code-evaluator-file-backed.test.ts b/packages/core/test/evaluation/code-evaluator-file-backed.test.ts
index e5e513471..6bae33521 100644
--- a/packages/core/test/evaluation/code-evaluator-file-backed.test.ts
+++ b/packages/core/test/evaluation/code-evaluator-file-backed.test.ts
@@ -9,7 +9,7 @@ import type { EvalTest } from '../../src/evaluation/types.js';
 
 const baseTestCase: EvalTest = {
   id: 'case-1',
-  dataset: 'test-dataset',
+  suite: 'test-dataset',
   question: 'Test question',
   input: [{ role: 'user', content: 'Test input' }],
   expected_output: [],
diff --git a/packages/core/test/evaluation/code-evaluator-multimodal.test.ts b/packages/core/test/evaluation/code-evaluator-multimodal.test.ts
index 29f14b75b..78784e6f2 100644
--- a/packages/core/test/evaluation/code-evaluator-multimodal.test.ts
+++ b/packages/core/test/evaluation/code-evaluator-multimodal.test.ts
@@ -10,7 +10,7 @@ import type { EvalTest } from '../../src/evaluation/types.js';
 
 const baseTestCase: EvalTest = {
   id: 'case-mm',
-  dataset: 'test-dataset',
+  suite: 'test-dataset',
   question: 'Test question',
   input: [{ role: 'user', content: 'Describe this image' }],
   expected_output: [],
diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts
index 38ba1c82f..42dabec24 100644
--- a/packages/core/test/evaluation/evaluators.test.ts
+++ b/packages/core/test/evaluation/evaluators.test.ts
@@ -79,7 +79,7 @@ class SequenceCapturingProvider implements Provider {
 
 const baseTestCase: EvalTest = {
   id: 'case-1',
-  dataset: 'test-dataset',
+  suite: 'test-dataset',
   question: 'Improve the logging implementation',
   input: [{ role: 'user', content: 'Please add logging' }],
   expected_output: [],
diff --git a/packages/core/test/evaluation/evaluators/composite-threshold.test.ts b/packages/core/test/evaluation/evaluators/composite-threshold.test.ts
index 33fb96d11..1034bcbd0 100644
--- a/packages/core/test/evaluation/evaluators/composite-threshold.test.ts
+++ b/packages/core/test/evaluation/evaluators/composite-threshold.test.ts
@@ -12,7 +12,7 @@ import type { EvalTest, EvaluatorConfig } from '../../../src/evaluation/types.js
 
 const baseTestCase: EvalTest = {
   id: 'threshold-test',
-  dataset: 'test',
+  suite: 'test',
   question: 'Test question',
   input: [{ role: 'user', content: 'Test' }],
   expected_output: [],
diff --git a/packages/core/test/evaluation/evaluators/execution-metrics.test.ts b/packages/core/test/evaluation/evaluators/execution-metrics.test.ts
index 3f6ed8e71..0a53f6671 100644
--- a/packages/core/test/evaluation/evaluators/execution-metrics.test.ts
+++ b/packages/core/test/evaluation/evaluators/execution-metrics.test.ts
@@ -7,7 +7,7 @@ import type { EvalTest, ExecutionMetricsEvaluatorConfig } from '../../../src/eva
 
 const baseTestCase: EvalTest = {
   id: 'metrics-test',
-  dataset: 'test',
+  suite: 'test',
   question: 'Test question',
   input: [{ role: 'user', content: 'Test' }],
   expected_output: [],
diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts
index 5eeda30de..d7ea0fdbd 100644
--- a/packages/core/test/evaluation/evaluators_variables.test.ts
+++ b/packages/core/test/evaluation/evaluators_variables.test.ts
@@ -25,7 +25,7 @@ class CapturingProvider implements Provider {
 
 const baseTestCase: EvalTest = {
   id: 'case-1',
-  dataset: 'test-dataset',
+  suite: 'test-dataset',
   question: 'Original Question Text',
   input: [{ role: 'user', content: [{ type: 'text', value: 'Input Message' }] }],
   expected_output: [{ type: 'text', value: 'Expected Output Message' }],
diff --git a/packages/core/test/evaluation/execution-metrics.test.ts b/packages/core/test/evaluation/execution-metrics.test.ts
index 615f8229b..aa0efecba 100644
--- a/packages/core/test/evaluation/execution-metrics.test.ts
+++ b/packages/core/test/evaluation/execution-metrics.test.ts
@@ -243,7 +243,7 @@ describe('Execution Metrics', () => {
 describe('Code Grader Metrics Integration', () => {
   const baseTestCase: EvalTest = {
     id: 'metrics-test',
-    dataset: 'test',
+    suite: 'test',
     question: 'Test question',
     input: [{ role: 'user', content: 'Test' }],
     expected_output: [],
diff --git a/packages/core/test/evaluation/execution-status.test.ts b/packages/core/test/evaluation/execution-status.test.ts
index 7f2028998..e5a9ca1ea 100644
--- a/packages/core/test/evaluation/execution-status.test.ts
+++ b/packages/core/test/evaluation/execution-status.test.ts
@@ -40,7 +40,7 @@ class FixedResponseProvider implements Provider {
 
 const baseTestCase: EvalTest = {
   id: 'exec-status-1',
-  dataset: 'test-dataset',
+  suite: 'test-dataset',
   question: 'Explain logging improvements',
   input: [{ role: 'user', content: 'Explain logging improvements' }],
   expected_output: [],
diff --git a/packages/core/test/evaluation/llm-grader-multimodal.test.ts b/packages/core/test/evaluation/llm-grader-multimodal.test.ts
index 093733927..1ff035a02 100644
--- a/packages/core/test/evaluation/llm-grader-multimodal.test.ts
+++ b/packages/core/test/evaluation/llm-grader-multimodal.test.ts
@@ -55,7 +55,7 @@ const { LlmGraderEvaluator } = await import('../../src/evaluation/evaluators.js'
 
 const baseTestCase: EvalTest = {
   id: 'mm-case-1',
-  dataset: 'test-dataset',
+  suite: 'test-dataset',
   question: 'Describe the image',
   input: [{ role: 'user', content: 'What is in this image?' }],
   expected_output: [],
diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
index 5a1b2e4b3..1285d91b5 100644
--- a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
+++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
@@ -9,7 +9,7 @@ import { loadTests } from '../../../src/evaluation/yaml-parser.js';
 describe('detectFormat', () => {
   it('returns jsonl for .jsonl extension', () => {
     expect(detectFormat('test.jsonl')).toBe('jsonl');
-    expect(detectFormat('/path/to/dataset.jsonl')).toBe('jsonl');
+    expect(detectFormat('/path/to/suite.jsonl')).toBe('jsonl');
   });
 
   it('returns yaml for .yaml extension', () => {
@@ -156,7 +156,7 @@ describe('loadTestsFromJsonl', () => {
     const cases = await loadTestsFromJsonl(jsonlPath, tempDir);
 
     expect(cases).toHaveLength(1);
-    expect(cases[0].dataset).toBe('my-tests');
+    expect(cases[0].suite).toBe('my-tests');
     expect(cases[0].evaluator).toBe('llm-grader');
   });
 
@@ -178,7 +178,7 @@ describe('loadTestsFromJsonl', () => {
     );
   });
 
-  it('uses default dataset name from filename when no sidecar', async () => {
+  it('uses default suite name from filename when no sidecar', async () => {
     const jsonlPath = path.join(tempDir, 'my-dataset.jsonl');
     await writeFile(
       jsonlPath,
@@ -188,7 +188,7 @@ describe('loadTestsFromJsonl', () => {
     const cases = await loadTestsFromJsonl(jsonlPath, tempDir);
 
     expect(cases).toHaveLength(1);
-    expect(cases[0].dataset).toBe('my-dataset');
+    expect(cases[0].suite).toBe('my-dataset');
   });
 
   it('supports per-case evaluators override', async () => {
@@ -430,7 +430,7 @@ tests:
     // Core fields should match
     expect(jsonlCases[0].id).toBe(yamlCases[0].id);
     expect(jsonlCases[0].criteria).toBe(yamlCases[0].criteria);
-    expect(jsonlCases[0].dataset).toBe(yamlCases[0].dataset);
+    expect(jsonlCases[0].suite).toBe(yamlCases[0].suite);
     expect(jsonlCases[0].input.length).toBe(yamlCases[0].input.length);
     expect(jsonlCases[0].input[0].role).toBe(yamlCases[0].input[0].role);
     expect(jsonlCases[0].input[0].content).toBe(yamlCases[0].input[0].content);
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index a51340761..5405e3242 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -112,7 +112,7 @@ class CapturingCliProvider implements Provider {
 
 const baseTestCase: EvalTest = {
   id: 'case-1',
-  dataset: 'test-dataset',
+  suite: 'test-dataset',
   question: 'Explain logging improvements',
   input: [{ role: 'user', content: 'Explain logging improvements' }],
   expected_output: [],
@@ -517,7 +517,7 @@ describe('runTestCase', () => {
     const result = await runEvalCase({
       evalCase: {
         id: 'multi',
-        dataset: 'ds',
+        suite: 'ds',
         question: '',
         input: [
           { role: 'system', content: 'Guide' },
@@ -562,7 +562,7 @@ describe('runTestCase', () => {
     await runEvalCase({
       evalCase: {
         id: 'single',
-        dataset: 'ds',
+        suite: 'ds',
         question: '',
         input: [{ role: 'user', content: 'Hello' }],
         expected_output: [],
@@ -682,7 +682,7 @@ class TraceProvider implements Provider {
 describe('runEvalCase trace integration', () => {
   const traceTestCase: EvalTest = {
     id: 'trace-case',
-    dataset: 'trace-dataset',
+    suite: 'trace-dataset',
     question: 'What is the weather?',
     input: [{ role: 'user', content: 'What is the weather?' }],
     expected_output: [],
@@ -1694,7 +1694,7 @@ rl.on('close', () => {
 describe('deterministic assertion evaluators in orchestrator', () => {
   const assertionTestCase: EvalTest = {
     id: 'assert-1',
-    dataset: 'test-dataset',
+    suite: 'test-dataset',
     question: 'Test question',
     input: [{ role: 'user', content: 'Test question' }],
     expected_output: [],
@@ -1871,7 +1871,7 @@ describe('deterministic assertion evaluators in orchestrator', () => {
 describe('criteria with assert runs only declared evaluators (#452)', () => {
   const criteriaTestCase: EvalTest = {
     id: 'no-implicit-grader-1',
-    dataset: 'test-dataset',
+    suite: 'test-dataset',
     question: 'Test question',
     input: [{ role: 'user', content: 'Test question' }],
     expected_output: [],
@@ -1972,7 +1972,7 @@ describe('criteria with assert runs only declared evaluators (#452)', () => {
 describe('required gates', () => {
   const assertionTestCase: EvalTest = {
     id: 'required-gate-1',
-    dataset: 'test-dataset',
+    suite: 'test-dataset',
     question: 'Test question',
     input: [{ role: 'user', content: 'Test question' }],
     expected_output: [],
diff --git a/packages/core/test/observability/streaming-observer.test.ts b/packages/core/test/observability/streaming-observer.test.ts
index 7acc36dcc..ae4102aa4 100644
--- a/packages/core/test/observability/streaming-observer.test.ts
+++ b/packages/core/test/observability/streaming-observer.test.ts
@@ -75,13 +75,13 @@ describe('OtelStreamingObserver', () => {
     const spans: MockSpan[] = [];
     const observer = new OtelStreamingObserver(createMockTracer(spans), createMockApi(), false);
 
-    observer.startEvalCase('test-1', 'my-target', 'my-dataset');
+    observer.startEvalCase('test-1', 'my-target', 'my-suite');
 
     expect(spans).toHaveLength(1);
     expect(spans[0].name).toBe('agentv.eval');
     expect(spans[0].attributes['agentv.test_id']).toBe('test-1');
     expect(spans[0].attributes['agentv.target']).toBe('my-target');
-    expect(spans[0].attributes['agentv.dataset']).toBe('my-dataset');
+    expect(spans[0].attributes['agentv.suite']).toBe('my-suite');
     expect(spans[0].attributes['gen_ai.system']).toBe('agentv');
     expect(spans[0].ended).toBe(false);
   });
@@ -191,7 +191,7 @@ describe('OtelStreamingObserver', () => {
     const spans: MockSpan[] = [];
     const observer = new OtelStreamingObserver(createMockTracer(spans), createMockApi(), true);
 
-    observer.startEvalCase('lifecycle-test', 'claude-target', 'qa-dataset');
+    observer.startEvalCase('lifecycle-test', 'claude-target', 'qa-suite');
     observer.onToolCall('search', { q: 'test' }, ['result1'], 200, 'tc-a');
     observer.onLlmCall('claude-sonnet-4-20250514', { input: 500, output: 100 });
     observer.onToolCall('write', { path: 'out.txt' }, 'ok', 50, 'tc-b');
diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/analyzer.md b/plugins/agentv-dev/skills/agentv-bench/agents/analyzer.md
index dcc2a12d3..d1e363eaf 100644
--- a/plugins/agentv-dev/skills/agentv-bench/agents/analyzer.md
+++ b/plugins/agentv-dev/skills/agentv-bench/agents/analyzer.md
@@ -21,7 +21,7 @@ You are an eval-quality analyst for AgentV. Your job is to read JSONL evaluation
 ### Step 1: Load Results
 
 Read every line of the JSONL results file. Each line is a JSON object with:
-- `test_id`, `dataset`, `score`, `assertions`, `reasoning`, `target`
+- `test_id`, `suite`, `score`, `assertions`, `reasoning`, `target`
 - `scores` (optional): Array of per-evaluator breakdowns with `name`, `type`, `score`, `weight`, `verdict`, `assertions`, `reasoning`
 
 If `eval-path` is provided, also read the EVAL.yaml to understand evaluator configurations.
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md
index 1894d2f14..86d724022 100644
--- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md
+++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md
@@ -105,7 +105,7 @@ tests:
 ## Eval File Structure
 
 **Required:** `tests` (array or string path)
-**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `dataset`, `workspace`, `assertions`, `input`
+**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `suite`, `workspace`, `assertions`, `input`
 
 **Test fields:**
 
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
index e0fdb12b1..b7c7b6566 100644
--- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
+++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
@@ -4412,7 +4412,7 @@
                   "conversation_id": {
                     "type": "string"
                   },
-                  "dataset": {
+                  "suite": {
                     "type": "string"
                   },
                   "note": {
@@ -8748,7 +8748,7 @@
                   "conversation_id": {
                     "type": "string"
                   },
-                  "dataset": {
+                  "suite": {
                     "type": "string"
                   },
                   "note": {
diff --git a/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md b/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md
index cc4d72651..2f75cedc2 100644
--- a/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md
+++ b/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md
@@ -23,7 +23,7 @@ agentv trace list [--limit N] [--format json|table]
 agentv trace show <result-file> [--test-id <id>] [--tree] [--format json|table]
 
 # Percentile statistics
-agentv trace stats <result-file> [--group-by target|dataset|test-id] [--format json|table]
+agentv trace stats <result-file> [--group-by target|suite|test-id] [--format json|table]
 
 # A/B comparison between runs
 agentv compare <baseline.jsonl> <candidate.jsonl> [--threshold 0.1] [--format json|table]
@@ -95,8 +95,8 @@ Look for:
 # By target provider
 agentv trace stats <result-file> --group-by target
 
-# By dataset
-agentv trace stats <result-file> --group-by dataset
+# By suite
+agentv trace stats <result-file> --group-by suite
 ```
 
 Compare providers side-by-side: which is cheaper, faster, more accurate?
@@ -114,9 +114,9 @@ agentv trace show <result-file> --format json \
 agentv trace show <result-file> --format json \
   | jq '[.[] | select(.token_usage.input + .token_usage.output > 10000) | {test_id, tokens: (.token_usage.input + .token_usage.output)}]'
 
-# Score distribution by dataset
+# Score distribution by suite
 agentv trace show <result-file> --format json \
-  | jq 'group_by(.dataset) | .[] | {dataset: .[0].dataset, count: length, avg_score: ([.[].score] | add / length)}'
+  | jq 'group_by(.suite) | .[] | {suite: .[0].suite, count: length, avg_score: ([.[].score] | add / length)}'
 
 # Tool usage frequency across all tests
 agentv trace show <result-file> --format json \
@@ -133,7 +133,7 @@ When analyzing traces, think about:
 
 1. **Efficiency**: Are tool calls/tokens proportional to task complexity? High tokens-per-tool may indicate verbose prompts or unnecessary context.
 
-2. **Error patterns**: Do failures cluster by target, dataset, or tool usage? Common patterns:
+2. **Error patterns**: Do failures cluster by target, suite, or tool usage? Common patterns:
    - Tool errors → agent can't access required resources
    - High LLM calls with low tool calls → agent stuck in reasoning loop
    - Missing tool calls → wrong tool routing

From ed93928b7615f15fba78596696b36644a845ae03 Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Sun, 5 Apr 2026 09:34:46 +0000
Subject: [PATCH 2/4] fix(test): use platform-agnostic paths in pi-coding-agent
 tests

The tests used hardcoded Windows backslash paths which fail on Linux.
Use path.join and path.sep for cross-platform compatibility.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../providers/pi-coding-agent.test.ts         | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/packages/core/test/evaluation/providers/pi-coding-agent.test.ts b/packages/core/test/evaluation/providers/pi-coding-agent.test.ts
index 0fb5041ad..d46fc1768 100644
--- a/packages/core/test/evaluation/providers/pi-coding-agent.test.ts
+++ b/packages/core/test/evaluation/providers/pi-coding-agent.test.ts
@@ -49,18 +49,33 @@ describe('PiCodingAgentProvider', () => {
   });
 
   it('builds the expected global npm module entry path', () => {
+    const { join } = require('node:path');
     expect(
       _internal.buildGlobalModuleEntry(
         '@mariozechner/pi-coding-agent',
-        'C:\\npm-global\\node_modules',
+        join('C:', 'npm-global', 'node_modules'),
       ),
-    ).toBe('C:\\npm-global\\node_modules\\@mariozechner\\pi-coding-agent\\dist\\index.js');
+    ).toBe(
+      join(
+        'C:',
+        'npm-global',
+        'node_modules',
+        '@mariozechner',
+        'pi-coding-agent',
+        'dist',
+        'index.js',
+      ),
+    );
     expect(
-      _internal.buildGlobalModuleEntry('@mariozechner/pi-ai', 'C:\\npm-global\\node_modules'),
-    ).toBe('C:\\npm-global\\node_modules\\@mariozechner\\pi-ai\\dist\\index.js');
+      _internal.buildGlobalModuleEntry(
+        '@mariozechner/pi-ai',
+        join('C:', 'npm-global', 'node_modules'),
+      ),
+    ).toBe(join('C:', 'npm-global', 'node_modules', '@mariozechner', 'pi-ai', 'dist', 'index.js'));
   });
 
   it('finds the agentv package root', () => {
-    expect(_internal.findAgentvRoot().endsWith('packages\\core')).toBe(true);
+    const { sep } = require('node:path');
+    expect(_internal.findAgentvRoot().endsWith(`packages${sep}core`)).toBe(true);
   });
 });

From c44dca6b956ecbf5a7a3d13357c676ab0d1fa40d Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Sun, 5 Apr 2026 09:37:51 +0000
Subject: [PATCH 3/4] docs: add Suites section to eval files documentation

Explains the suite concept, as requested in a comment on #943.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/web/src/content/docs/docs/evaluation/eval-files.mdx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
index 3a8ec9b30..521de9814 100644
--- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
@@ -7,6 +7,10 @@ sidebar:
 
 Evaluation files define the test cases, targets, and evaluators for an evaluation run. AgentV supports two formats: YAML and JSONL.
 
+## Suites
+
+An eval file is a **suite**: it binds test cases to an execution context (workspace, hooks, targets, trials). Test cases can be inline or loaded from an external file via `tests: ./cases.yaml` for reuse across suites.
+
 ## YAML Format
 
 The primary format. A single file contains metadata, execution config, and tests:

From 649edf65ac70d365e386e7da054742cdaff94c42 Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Sun, 5 Apr 2026 09:48:43 +0000
Subject: [PATCH 4/4] =?UTF-8?q?fix:=20address=20code=20review=20findings?=
 =?UTF-8?q?=20from=20dataset=E2=86=92suite=20rename?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix missed verbose log "JSONL Dataset:" → "JSONL Suite:" in jsonl-parser
- Clean up stale "legacy" comment in discover.ts
- Rename internal datasetFile/datasetFilePath vars in check-eval-baselines script

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/discover.ts        |  2 +-
 .../src/evaluation/loaders/jsonl-parser.ts    |  2 +-
 scripts/check-eval-baselines.ts               | 28 +++++++++----------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/apps/cli/src/commands/eval/discover.ts b/apps/cli/src/commands/eval/discover.ts
index cbb57c44b..0cb116ffb 100644
--- a/apps/cli/src/commands/eval/discover.ts
+++ b/apps/cli/src/commands/eval/discover.ts
@@ -18,7 +18,7 @@ export interface DiscoveredEvalFile {
  *
  * Uses `eval_patterns` from `.agentv/config.yaml` if configured,
  * otherwise falls back to default patterns that match `suite*.yaml`,
- * `eval.yaml`, and legacy `dataset*.yaml` files under `evals/` directories.
+ * `eval.yaml`, and `dataset*.yaml` files under `evals/` directories.
  */
 export async function discoverEvalFiles(cwd: string): Promise<readonly DiscoveredEvalFile[]> {
   const repoRoot = await findRepoRoot(cwd);
diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts
index f426b155b..356f35862 100644
--- a/packages/core/src/evaluation/loaders/jsonl-parser.ts
+++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts
@@ -168,7 +168,7 @@ export async function loadTestsFromJsonl(
   const globalExecution = sidecar.execution;
 
   if (verbose) {
-    console.log(`\n[JSONL Dataset: ${evalFilePath}]`);
+    console.log(`\n[JSONL Suite: ${evalFilePath}]`);
     console.log(`  Cases: ${rawCases.length}`);
     console.log(`  Suite: ${suiteName}`);
     if (sidecar.description) {
diff --git a/scripts/check-eval-baselines.ts b/scripts/check-eval-baselines.ts
index 372d95ee6..0fc1c2ed7 100644
--- a/scripts/check-eval-baselines.ts
+++ b/scripts/check-eval-baselines.ts
@@ -85,8 +85,8 @@ async function findBaselinedYamlFiles(dir: string, results: string[] = []): Prom
   return results;
 }
 
-function baselinePathFor(datasetFilePath: string): string {
-  const absolutePath = path.resolve(datasetFilePath);
+function baselinePathFor(evalFilePath: string): string {
+  const absolutePath = path.resolve(evalFilePath);
   return absolutePath.replace(/\.ya?ml$/, '.baseline.jsonl');
 }
 
@@ -95,7 +95,7 @@ function candidatePathFor(baselinePath: string): string {
   return baselinePath.replace(/\.baseline\.jsonl$/, '.candidate.jsonl');
 }
 
-async function runAgentVEval(datasetFile: string, candidatePath: string): Promise<number> {
+async function runAgentVEval(evalFile: string, candidatePath: string): Promise<number> {
   const env = { ...process.env };
   if (!env.TOOL_EVAL_PLUGINS_DIR) {
     env.TOOL_EVAL_PLUGINS_DIR = path.join(
@@ -106,7 +106,7 @@ async function runAgentVEval(datasetFile: string, candidatePath: string): Promis
     );
   }
 
-  const args = ['bun', 'agentv', 'eval', datasetFile, '--out', candidatePath];
+  const args = ['bun', 'agentv', 'eval', evalFile, '--out', candidatePath];
   const proc = Bun.spawn(args, {
     cwd: repoRoot,
     stdout: 'inherit',
@@ -142,8 +142,8 @@ function cleanupCandidate(candidatePath: string): void {
   }
 }
 
-async function processDatasetFile(
-  datasetFile: string,
+async function processEvalFile(
+  evalFile: string,
   baselinePath: string,
   options: CliOptions,
 ): Promise<{ success: boolean; updated: boolean; created: boolean }> {
@@ -151,8 +151,8 @@ async function processDatasetFile(
   const candidatePath = candidatePathFor(baselinePath);
   const baselineExists = existsSync(baselinePath);
 
-  console.log(`\nRunning: ${path.relative(repoRoot, datasetFile)}`);
-  const exitCode = await runAgentVEval(datasetFile, candidatePath);
+  console.log(`\nRunning: ${path.relative(repoRoot, evalFile)}`);
+  const exitCode = await runAgentVEval(evalFile, candidatePath);
   if (exitCode !== 0) {
     cleanupCandidate(candidatePath);
     return { success: false, updated: false, created: false };
@@ -210,18 +210,18 @@ async function main(): Promise<void> {
   }
 
   // Collect dataset file → baseline path pairs
-  const pairs: Array<{ datasetFile: string; baselinePath: string }> = [];
+  const pairs: Array<{ evalFile: string; baselinePath: string }> = [];
 
   if (options.evalFile) {
     const absPath = path.resolve(options.evalFile);
-    pairs.push({ datasetFile: absPath, baselinePath: baselinePathFor(absPath) });
+    pairs.push({ evalFile: absPath, baselinePath: baselinePathFor(absPath) });
   } else {
     // Discover eval YAML files: by naming convention + by existing baselines
     const byConvention = await findEvalYamlFiles(examplesRoot);
     const byBaseline = await findBaselinedYamlFiles(examplesRoot);
     const allDatasetFiles = [...new Set([...byConvention, ...byBaseline])];
     for (const df of allDatasetFiles) {
-      pairs.push({ datasetFile: df, baselinePath: baselinePathFor(df) });
+      pairs.push({ evalFile: df, baselinePath: baselinePathFor(df) });
     }
 
     if (pairs.length === 0) {
@@ -234,10 +234,10 @@ async function main(): Promise<void> {
   let updatedCount = 0;
   let createdCount = 0;
 
-  for (const { datasetFile, baselinePath } of pairs.sort((a, b) =>
-    a.datasetFile.localeCompare(b.datasetFile),
+  for (const { evalFile, baselinePath } of pairs.sort((a, b) =>
+    a.evalFile.localeCompare(b.evalFile),
   )) {
-    const result = await processDatasetFile(datasetFile, baselinePath, options);
+    const result = await processEvalFile(evalFile, baselinePath, options);
     if (!result.success) failures += 1;
     if (result.updated) updatedCount += 1;
     if (result.created) createdCount += 1;