diff --git a/AGENTS.md b/AGENTS.md index 3bbae5d3..02bfdd49 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -277,7 +277,11 @@ Before marking any branch as ready for review, complete this checklist: 4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types). -5. **Mark PR as ready** only after steps 1-4 have been completed AND red/green UAT evidence is included in the PR. +5. **Live eval verification**: For changes affecting scoring, thresholds, or evaluator behavior, run at least one real eval with a live provider (not `--dry-run`) and verify the output JSONL has correct scores, verdicts, and execution status. + +6. **Studio UX verification**: For changes affecting config, scoring display, or studio API, use `agent-browser` to verify the studio UI still renders and functions correctly (settings page loads, pass/fail indicators are correct, config saves work). + +7. **Mark PR as ready** only after steps 1-6 have been completed AND red/green UAT evidence is included in the PR. ## Documentation Updates diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 03c2e901..63be38f3 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -1,7 +1,7 @@ import { mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; -import type { EvaluationResult, EvaluatorResult } from '@agentv/core'; +import { DEFAULT_THRESHOLD, type EvaluationResult, type EvaluatorResult } from '@agentv/core'; import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; import { RESULT_INDEX_FILENAME } from './result-layout.js'; @@ -118,8 +118,6 @@ export type ResultIndexArtifact = IndexArtifactEntry; // Statistics helpers // --------------------------------------------------------------------------- -const PASS_THRESHOLD = 0.8; - function computeStats(values: readonly number[]): { mean: number; stddev: number } { if (values.length === 0) { return { mean: 0, stddev: 0 }; @@ -135,10 +133,10 @@ function computeStats(values: readonly number[]): { mean: number; stddev: number function computePassRate(result: EvaluationResult): number { const scores = result.scores; if (scores && scores.length > 0) { - const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length; + const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length; return passed / scores.length; } - return (result.score ?? 0) >= PASS_THRESHOLD ? 1.0 : 0.0; + return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1.0 : 0.0; } // --------------------------------------------------------------------------- diff --git a/apps/cli/src/commands/eval/benchmark-writer.ts b/apps/cli/src/commands/eval/benchmark-writer.ts index f1056b6a..562dd8a8 100644 --- a/apps/cli/src/commands/eval/benchmark-writer.ts +++ b/apps/cli/src/commands/eval/benchmark-writer.ts @@ -1,8 +1,6 @@ import { writeFile } from 'node:fs/promises'; -import type { EvaluationResult } from '@agentv/core'; - -const PASS_THRESHOLD = 0.8; +import { DEFAULT_THRESHOLD, type EvaluationResult } from '@agentv/core'; interface BenchmarkStats { readonly mean: number; @@ -43,10 +41,10 @@ function computeStats(values: readonly number[]): BenchmarkStats { function computePassRate(result: EvaluationResult): number { const scores = result.scores; if (scores && scores.length > 0) { - const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length; + const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length; return passed / scores.length; } - return result.score >= PASS_THRESHOLD ? 1.0 : 0.0; + return result.score >= DEFAULT_THRESHOLD ? 1.0 : 0.0; } /** diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 1e204b5c..553d1487 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -135,7 +135,7 @@ export function calculateEvaluationSummary( // Count by execution status. When a custom threshold is provided, // recompute passed/failed from raw scores instead of executionStatus - // (which uses the hardcoded PASS_THRESHOLD of 0.8). + // (which uses the hardcoded DEFAULT_THRESHOLD of 0.8). const executionErrorCount = executionErrors.length; const scoreThreshold = options?.threshold; const passedCount = diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index f10a97ab..01c6f6fc 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -1,7 +1,7 @@ import { readFileSync, readdirSync, statSync } from 'node:fs'; import path from 'node:path'; import type { EvaluationResult, TraceSummary } from '@agentv/core'; -import { PASS_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; +import { DEFAULT_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; import { RESULT_INDEX_FILENAME, RESULT_RUNS_DIRNAME, @@ -567,7 +567,7 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { const results = loadResultFile(filePath); const testCount = results.length; - const passCount = results.filter((r) => r.score >= PASS_THRESHOLD).length; + const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length; const passRate = testCount > 0 ? passCount / testCount : 0; const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0; diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 2ef89726..6bd2cc50 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -282,7 +282,7 @@ function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) { if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const suiteMap = new Map(); for (const r of loaded) { const ds = r.suite ?? r.target ?? 'default'; @@ -311,7 +311,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const categoryMap = new Map< string, { total: number; passed: number; scoreSum: number; suites: Set } @@ -351,7 +351,7 @@ function handleCategorySuites(c: C, { searchDir, agentvDir }: DataContext) { if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category); const suiteMap = new Map(); for (const r of filtered) { @@ -467,7 +467,7 @@ function handleEvalFileContent(c: C, { searchDir }: DataContext) { function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) { const metas = listResultFiles(searchDir); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const experimentMap = new Map< string, { @@ -520,7 +520,7 @@ function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) { function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { const metas = listResultFiles(searchDir); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const targetMap = new Map< string, { @@ -615,8 +615,8 @@ export function createApp( const body = await c.req.json>(); const current = loadStudioConfig(agentvDir); const updated = { ...current, ...body }; - if (typeof updated.pass_threshold === 'number') { - updated.pass_threshold = Math.min(1, Math.max(0, updated.pass_threshold)); + if (typeof updated.threshold === 'number') { + updated.threshold = Math.min(1, Math.max(0, updated.threshold)); } saveStudioConfig(agentvDir, updated); return c.json(updated); diff --git a/apps/cli/src/commands/results/studio-config.ts b/apps/cli/src/commands/results/studio-config.ts index 14959a9e..4be3c56a 100644 --- a/apps/cli/src/commands/results/studio-config.ts +++ b/apps/cli/src/commands/results/studio-config.ts @@ -10,10 +10,10 @@ * config.yaml format: * required_version: ">=4.2.0" * studio: - * pass_threshold: 0.8 # score >= this value is considered "pass" + * threshold: 0.8 # score >= this value is considered "pass" * - * Backward compat: reads root-level `pass_threshold` if `studio:` section - * is absent (legacy format). On save, always writes under `studio:`. + * Backward compat: reads `studio.pass_threshold` and root-level `pass_threshold` + * as fallback. On save, always writes `threshold` under `studio:`. * * If no config.yaml exists, defaults are used. */ @@ -21,23 +21,23 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'; import path from 'node:path'; -import { PASS_THRESHOLD } from '@agentv/core'; +import { DEFAULT_THRESHOLD } from '@agentv/core'; import { parse as parseYaml, stringify as stringifyYaml } from 'yaml'; export interface StudioConfig { - pass_threshold: number; + threshold: number; } const DEFAULTS: StudioConfig = { - pass_threshold: PASS_THRESHOLD, + threshold: DEFAULT_THRESHOLD, }; /** * Load studio config from `config.yaml` in the given `.agentv/` directory. - * Reads from `studio.pass_threshold`, falling back to root-level - * `pass_threshold` for backward compatibility. + * Reads from `studio.threshold`, falling back to `studio.pass_threshold` (legacy), + * then root-level `pass_threshold` (legacy) for backward compatibility. * Returns defaults when the file does not exist or is empty. - * Clamps `pass_threshold` to [0, 1]. + * Clamps `threshold` to [0, 1]. */ export function loadStudioConfig(agentvDir: string): StudioConfig { const configPath = path.join(agentvDir, 'config.yaml'); @@ -53,20 +53,22 @@ export function loadStudioConfig(agentvDir: string): StudioConfig { return { ...DEFAULTS }; } - // Prefer studio.pass_threshold, fall back to root-level pass_threshold (legacy) + // Prefer studio.threshold, fall back to studio.pass_threshold, then root-level pass_threshold const studio = (parsed as Record).studio; - let threshold = DEFAULTS.pass_threshold; + let threshold = DEFAULTS.threshold; if (studio && typeof studio === 'object' && !Array.isArray(studio)) { - const studioThreshold = (studio as Record).pass_threshold; - if (typeof studioThreshold === 'number') { - threshold = studioThreshold; + const studioObj = studio as Record; + if (typeof studioObj.threshold === 'number') { + threshold = studioObj.threshold; + } else if (typeof studioObj.pass_threshold === 'number') { + threshold = studioObj.pass_threshold; } } else if (typeof (parsed as Record).pass_threshold === 'number') { threshold = (parsed as Record).pass_threshold as number; } return { - pass_threshold: Math.min(1, Math.max(0, threshold)), + threshold: Math.min(1, Math.max(0, threshold)), }; } @@ -97,8 +99,14 @@ export function saveStudioConfig(agentvDir: string, config: StudioConfig): void const { pass_threshold: _, ...rest } = existing; existing = rest; - // Merge studio section - existing.studio = { ...config }; + // Clean legacy pass_threshold from studio section if present + const existingStudio = existing.studio; + if (existingStudio && typeof existingStudio === 'object' && !Array.isArray(existingStudio)) { + const { pass_threshold: __, ...studioRest } = existingStudio as Record; + existing.studio = { ...studioRest, ...config }; + } else { + existing.studio = { ...config }; + } const yamlStr = stringifyYaml(existing); writeFileSync(configPath, yamlStr, 'utf-8'); diff --git a/apps/cli/test/commands/results/studio-config.test.ts b/apps/cli/test/commands/results/studio-config.test.ts index 4a8e46e8..cb4ec6f4 100644 --- a/apps/cli/test/commands/results/studio-config.test.ts +++ b/apps/cli/test/commands/results/studio-config.test.ts @@ -3,7 +3,7 @@ import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; -import { PASS_THRESHOLD } from '@agentv/core'; +import { DEFAULT_THRESHOLD } from '@agentv/core'; import { parse as parseYaml } from 'yaml'; import { loadStudioConfig, saveStudioConfig } from '../../../src/commands/results/studio-config.js'; @@ -21,52 +21,67 @@ describe('loadStudioConfig', () => { it('returns defaults when no config.yaml exists', () => { const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(PASS_THRESHOLD); + expect(config.threshold).toBe(DEFAULT_THRESHOLD); }); - it('reads pass_threshold from studio section', () => { + it('reads threshold from studio section', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: 0.6\n'); + const config = loadStudioConfig(tempDir); + expect(config.threshold).toBe(0.6); + }); + + it('reads pass_threshold from studio section as fallback (legacy)', () => { writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 0.6\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(0.6); + expect(config.threshold).toBe(0.6); + }); + + it('prefers studio.threshold over studio.pass_threshold', () => { + writeFileSync( + path.join(tempDir, 'config.yaml'), + 'studio:\n threshold: 0.9\n pass_threshold: 0.5\n', + ); + const config = loadStudioConfig(tempDir); + expect(config.threshold).toBe(0.9); }); it('falls back to root-level pass_threshold (legacy)', () => { writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: 0.7\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(0.7); + expect(config.threshold).toBe(0.7); }); it('prefers studio section over root-level pass_threshold', () => { writeFileSync( path.join(tempDir, 'config.yaml'), - 'pass_threshold: 0.5\nstudio:\n pass_threshold: 0.9\n', + 'pass_threshold: 0.5\nstudio:\n threshold: 0.9\n', ); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(0.9); + expect(config.threshold).toBe(0.9); }); - it('clamps pass_threshold to 0 when negative', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: -0.5\n'); + it('clamps threshold to 0 when negative', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: -0.5\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(0); + expect(config.threshold).toBe(0); }); - it('clamps pass_threshold to 1 when above 1', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 1.5\n'); + it('clamps threshold to 1 when above 1', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: 1.5\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(1); + expect(config.threshold).toBe(1); }); it('returns defaults for empty config.yaml', () => { writeFileSync(path.join(tempDir, 'config.yaml'), ''); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(PASS_THRESHOLD); + expect(config.threshold).toBe(DEFAULT_THRESHOLD); }); - it('returns defaults when pass_threshold is not a number', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: "high"\n'); + it('returns defaults when threshold is not a number', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: "high"\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(PASS_THRESHOLD); + expect(config.threshold).toBe(DEFAULT_THRESHOLD); }); }); @@ -86,13 +101,13 @@ describe('saveStudioConfig', () => { path.join(tempDir, 'config.yaml'), 'required_version: ">=4.2.0"\neval_patterns:\n - "**/*.eval.yaml"\n', ); - saveStudioConfig(tempDir, { pass_threshold: 0.9 }); + saveStudioConfig(tempDir, { threshold: 0.9 }); const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); const parsed = parseYaml(raw) as Record; expect(parsed.required_version).toBe('>=4.2.0'); expect(parsed.eval_patterns).toEqual(['**/*.eval.yaml']); - expect((parsed.studio as Record).pass_threshold).toBe(0.9); + expect((parsed.studio as Record).threshold).toBe(0.9); }); it('removes legacy root-level pass_threshold on save', () => { @@ -100,29 +115,40 @@ describe('saveStudioConfig', () => { path.join(tempDir, 'config.yaml'), 'required_version: ">=4.2.0"\npass_threshold: 0.8\n', ); - saveStudioConfig(tempDir, { pass_threshold: 0.7 }); + saveStudioConfig(tempDir, { threshold: 0.7 }); const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); const parsed = parseYaml(raw) as Record; expect(parsed.required_version).toBe('>=4.2.0'); expect(parsed.pass_threshold).toBeUndefined(); - expect((parsed.studio as Record).pass_threshold).toBe(0.7); + expect((parsed.studio as Record).threshold).toBe(0.7); + }); + + it('removes legacy pass_threshold from studio section on save', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 0.8\n'); + saveStudioConfig(tempDir, { threshold: 0.7 }); + + const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); + const parsed = parseYaml(raw) as Record; + const studio = parsed.studio as Record; + expect(studio.pass_threshold).toBeUndefined(); + expect(studio.threshold).toBe(0.7); }); it('creates config.yaml when it does not exist', () => { - saveStudioConfig(tempDir, { pass_threshold: 0.6 }); + saveStudioConfig(tempDir, { threshold: 0.6 }); const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); const parsed = parseYaml(raw) as Record; - expect((parsed.studio as Record).pass_threshold).toBe(0.6); + expect((parsed.studio as Record).threshold).toBe(0.6); }); it('creates directory if it does not exist', () => { const nestedDir = path.join(tempDir, 'nested', '.agentv'); - saveStudioConfig(nestedDir, { pass_threshold: 0.5 }); + saveStudioConfig(nestedDir, { threshold: 0.5 }); const raw = readFileSync(path.join(nestedDir, 'config.yaml'), 'utf-8'); const parsed = parseYaml(raw) as Record; - expect((parsed.studio as Record).pass_threshold).toBe(0.5); + expect((parsed.studio as Record).threshold).toBe(0.5); }); }); diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index 020901ee..e28279ce 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -120,7 +120,7 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) function StepsTab({ result }: { result: EvalResult }) { const { data: config } = useStudioConfig(); - const passThreshold = config?.pass_threshold ?? 0.8; + const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; const assertions = result.assertions ?? []; const hasFailed = !isPassing(result.score, passThreshold) || diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 6a9a25ee..baa5b852 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -86,7 +86,7 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate export function RunDetail({ results, runId, projectId }: RunDetailProps) { const { data: config } = useStudioConfig(); - const passThreshold = config?.pass_threshold ?? 0.8; + const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; const total = results.length; const passed = results.filter((r) => isPassing(r.score, passThreshold)).length; diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index fa9f56d8..461eb74e 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -188,7 +188,7 @@ function RunSidebar() { function EvalSidebar({ runId, currentEvalId }: { runId: string; currentEvalId: string }) { const { data } = useRunDetail(runId); const { data: config } = useStudioConfig(); - const passThreshold = config?.pass_threshold ?? 0.8; + const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; return (