Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,11 @@ Before marking any branch as ready for review, complete this checklist:

4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types).

5. **Mark PR as ready** only after steps 1-4 have been completed AND red/green UAT evidence is included in the PR.
5. **Live eval verification**: For changes affecting scoring, thresholds, or evaluator behavior, run at least one real eval with a live provider (not `--dry-run`) and verify the output JSONL has correct scores, verdicts, and execution status.

6. **Studio UX verification**: For changes affecting config, scoring display, or studio API, use `agent-browser` to verify the studio UI still renders and functions correctly (settings page loads, pass/fail indicators are correct, config saves work).

7. **Mark PR as ready** only after steps 1-6 have been completed AND red/green UAT evidence is included in the PR.

## Documentation Updates

Expand Down
8 changes: 3 additions & 5 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import path from 'node:path';

import type { EvaluationResult, EvaluatorResult } from '@agentv/core';
import { DEFAULT_THRESHOLD, type EvaluationResult, type EvaluatorResult } from '@agentv/core';
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
import { RESULT_INDEX_FILENAME } from './result-layout.js';

Expand Down Expand Up @@ -118,8 +118,6 @@ export type ResultIndexArtifact = IndexArtifactEntry;
// Statistics helpers
// ---------------------------------------------------------------------------

const PASS_THRESHOLD = 0.8;

function computeStats(values: readonly number[]): { mean: number; stddev: number } {
if (values.length === 0) {
return { mean: 0, stddev: 0 };
Expand All @@ -135,10 +133,10 @@ function computeStats(values: readonly number[]): { mean: number; stddev: number
function computePassRate(result: EvaluationResult): number {
const scores = result.scores;
if (scores && scores.length > 0) {
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
return passed / scores.length;
}
return (result.score ?? 0) >= PASS_THRESHOLD ? 1.0 : 0.0;
return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1.0 : 0.0;
}

// ---------------------------------------------------------------------------
Expand Down
8 changes: 3 additions & 5 deletions apps/cli/src/commands/eval/benchmark-writer.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import { writeFile } from 'node:fs/promises';

import type { EvaluationResult } from '@agentv/core';

const PASS_THRESHOLD = 0.8;
import { DEFAULT_THRESHOLD, type EvaluationResult } from '@agentv/core';

interface BenchmarkStats {
readonly mean: number;
Expand Down Expand Up @@ -43,10 +41,10 @@ function computeStats(values: readonly number[]): BenchmarkStats {
function computePassRate(result: EvaluationResult): number {
const scores = result.scores;
if (scores && scores.length > 0) {
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
return passed / scores.length;
}
return result.score >= PASS_THRESHOLD ? 1.0 : 0.0;
return result.score >= DEFAULT_THRESHOLD ? 1.0 : 0.0;
}

/**
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/statistics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ export function calculateEvaluationSummary(

// Count by execution status. When a custom threshold is provided,
// recompute passed/failed from raw scores instead of executionStatus
// (which uses the hardcoded PASS_THRESHOLD of 0.8).
// (which uses the hardcoded DEFAULT_THRESHOLD of 0.8).
const executionErrorCount = executionErrors.length;
const scoreThreshold = options?.threshold;
const passedCount =
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/src/commands/inspect/utils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { readFileSync, readdirSync, statSync } from 'node:fs';
import path from 'node:path';
import type { EvaluationResult, TraceSummary } from '@agentv/core';
import { PASS_THRESHOLD, toCamelCaseDeep } from '@agentv/core';
import { DEFAULT_THRESHOLD, toCamelCaseDeep } from '@agentv/core';
import {
RESULT_INDEX_FILENAME,
RESULT_RUNS_DIRNAME,
Expand Down Expand Up @@ -567,7 +567,7 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
const results = loadResultFile(filePath);

const testCount = results.length;
const passCount = results.filter((r) => r.score >= PASS_THRESHOLD).length;
const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
const passRate = testCount > 0 ? passCount / testCount : 0;
const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0;

Expand Down
14 changes: 7 additions & 7 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) {
if (!meta) return c.json({ error: 'Run not found' }, 404);
try {
const loaded = loadManifestResults(meta.path);
const { pass_threshold } = loadStudioConfig(agentvDir);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
const suiteMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
for (const r of loaded) {
const ds = r.suite ?? r.target ?? 'default';
Expand Down Expand Up @@ -311,7 +311,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) {
if (!meta) return c.json({ error: 'Run not found' }, 404);
try {
const loaded = loadManifestResults(meta.path);
const { pass_threshold } = loadStudioConfig(agentvDir);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
const categoryMap = new Map<
string,
{ total: number; passed: number; scoreSum: number; suites: Set<string> }
Expand Down Expand Up @@ -351,7 +351,7 @@ function handleCategorySuites(c: C, { searchDir, agentvDir }: DataContext) {
if (!meta) return c.json({ error: 'Run not found' }, 404);
try {
const loaded = loadManifestResults(meta.path);
const { pass_threshold } = loadStudioConfig(agentvDir);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
const suiteMap = new Map<string, { total: number; passed: number; scoreSum: number }>();
for (const r of filtered) {
Expand Down Expand Up @@ -467,7 +467,7 @@ function handleEvalFileContent(c: C, { searchDir }: DataContext) {

function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) {
const metas = listResultFiles(searchDir);
const { pass_threshold } = loadStudioConfig(agentvDir);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
const experimentMap = new Map<
string,
{
Expand Down Expand Up @@ -520,7 +520,7 @@ function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) {

function handleTargets(c: C, { searchDir, agentvDir }: DataContext) {
const metas = listResultFiles(searchDir);
const { pass_threshold } = loadStudioConfig(agentvDir);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
const targetMap = new Map<
string,
{
Expand Down Expand Up @@ -615,8 +615,8 @@ export function createApp(
const body = await c.req.json<Partial<StudioConfig>>();
const current = loadStudioConfig(agentvDir);
const updated = { ...current, ...body };
if (typeof updated.pass_threshold === 'number') {
updated.pass_threshold = Math.min(1, Math.max(0, updated.pass_threshold));
if (typeof updated.threshold === 'number') {
updated.threshold = Math.min(1, Math.max(0, updated.threshold));
}
saveStudioConfig(agentvDir, updated);
return c.json(updated);
Expand Down
42 changes: 25 additions & 17 deletions apps/cli/src/commands/results/studio-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,34 @@
* config.yaml format:
* required_version: ">=4.2.0"
* studio:
* pass_threshold: 0.8 # score >= this value is considered "pass"
* threshold: 0.8 # score >= this value is considered "pass"
*
* Backward compat: reads root-level `pass_threshold` if `studio:` section
* is absent (legacy format). On save, always writes under `studio:`.
* Backward compat: when `studio.threshold` is absent, reads the legacy
* `studio.pass_threshold`. Root-level `pass_threshold` (legacy) is read only
* when there is no `studio:` mapping. On save, always writes `threshold`
* under `studio:`.
*
* If no config.yaml exists, defaults are used.
*/

import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
import path from 'node:path';

import { PASS_THRESHOLD } from '@agentv/core';
import { DEFAULT_THRESHOLD } from '@agentv/core';
import { parse as parseYaml, stringify as stringifyYaml } from 'yaml';

export interface StudioConfig {
pass_threshold: number;
threshold: number;
}

const DEFAULTS: StudioConfig = {
pass_threshold: PASS_THRESHOLD,
threshold: DEFAULT_THRESHOLD,
};

/**
* Load studio config from `config.yaml` in the given `.agentv/` directory.
* Reads from `studio.pass_threshold`, falling back to root-level
* `pass_threshold` for backward compatibility.
* Reads from `studio.threshold`, falling back to `studio.pass_threshold` (legacy).
* Root-level `pass_threshold` (legacy) is consulted only when no `studio:`
* mapping exists, for backward compatibility.
* Returns defaults when the file does not exist or is empty.
* Clamps `pass_threshold` to [0, 1].
* Clamps `threshold` to [0, 1].
*/
export function loadStudioConfig(agentvDir: string): StudioConfig {
const configPath = path.join(agentvDir, 'config.yaml');
Expand All @@ -53,20 +53,22 @@ export function loadStudioConfig(agentvDir: string): StudioConfig {
return { ...DEFAULTS };
}

// Prefer studio.pass_threshold, fall back to root-level pass_threshold (legacy)
// Prefer studio.threshold, then studio.pass_threshold (legacy); root-level
// pass_threshold (legacy) is used only when there is no studio mapping
const studio = (parsed as Record<string, unknown>).studio;
let threshold = DEFAULTS.pass_threshold;
let threshold = DEFAULTS.threshold;
if (studio && typeof studio === 'object' && !Array.isArray(studio)) {
const studioThreshold = (studio as Record<string, unknown>).pass_threshold;
if (typeof studioThreshold === 'number') {
threshold = studioThreshold;
const studioObj = studio as Record<string, unknown>;
if (typeof studioObj.threshold === 'number') {
threshold = studioObj.threshold;
} else if (typeof studioObj.pass_threshold === 'number') {
threshold = studioObj.pass_threshold;
}
} else if (typeof (parsed as Record<string, unknown>).pass_threshold === 'number') {
threshold = (parsed as Record<string, unknown>).pass_threshold as number;
}

return {
pass_threshold: Math.min(1, Math.max(0, threshold)),
threshold: Math.min(1, Math.max(0, threshold)),
};
}

Expand Down Expand Up @@ -97,8 +99,14 @@ export function saveStudioConfig(agentvDir: string, config: StudioConfig): void
const { pass_threshold: _, ...rest } = existing;
existing = rest;

// Merge studio section
existing.studio = { ...config };
// Clean legacy pass_threshold from studio section if present
const existingStudio = existing.studio;
if (existingStudio && typeof existingStudio === 'object' && !Array.isArray(existingStudio)) {
const { pass_threshold: __, ...studioRest } = existingStudio as Record<string, unknown>;
existing.studio = { ...studioRest, ...config };
} else {
existing.studio = { ...config };
}

const yamlStr = stringifyYaml(existing);
writeFileSync(configPath, yamlStr, 'utf-8');
Expand Down
76 changes: 51 additions & 25 deletions apps/cli/test/commands/results/studio-config.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import path from 'node:path';

import { PASS_THRESHOLD } from '@agentv/core';
import { DEFAULT_THRESHOLD } from '@agentv/core';
import { parse as parseYaml } from 'yaml';

import { loadStudioConfig, saveStudioConfig } from '../../../src/commands/results/studio-config.js';
Expand All @@ -21,52 +21,67 @@ describe('loadStudioConfig', () => {

it('returns defaults when no config.yaml exists', () => {
const config = loadStudioConfig(tempDir);
expect(config.pass_threshold).toBe(PASS_THRESHOLD);
expect(config.threshold).toBe(DEFAULT_THRESHOLD);
});

it('reads pass_threshold from studio section', () => {
it('reads threshold from studio section', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: 0.6\n');
const config = loadStudioConfig(tempDir);
expect(config.threshold).toBe(0.6);
});

it('reads pass_threshold from studio section as fallback (legacy)', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 0.6\n');
const config = loadStudioConfig(tempDir);
expect(config.pass_threshold).toBe(0.6);
expect(config.threshold).toBe(0.6);
});

it('prefers studio.threshold over studio.pass_threshold', () => {
writeFileSync(
path.join(tempDir, 'config.yaml'),
'studio:\n threshold: 0.9\n pass_threshold: 0.5\n',
);
const config = loadStudioConfig(tempDir);
expect(config.threshold).toBe(0.9);
});

it('falls back to root-level pass_threshold (legacy)', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: 0.7\n');
const config = loadStudioConfig(tempDir);
expect(config.pass_threshold).toBe(0.7);
expect(config.threshold).toBe(0.7);
});

it('prefers studio section over root-level pass_threshold', () => {
writeFileSync(
path.join(tempDir, 'config.yaml'),
'pass_threshold: 0.5\nstudio:\n pass_threshold: 0.9\n',
'pass_threshold: 0.5\nstudio:\n threshold: 0.9\n',
);
const config = loadStudioConfig(tempDir);
expect(config.pass_threshold).toBe(0.9);
expect(config.threshold).toBe(0.9);
});

it('clamps pass_threshold to 0 when negative', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: -0.5\n');
it('clamps threshold to 0 when negative', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: -0.5\n');
const config = loadStudioConfig(tempDir);
expect(config.pass_threshold).toBe(0);
expect(config.threshold).toBe(0);
});

it('clamps pass_threshold to 1 when above 1', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 1.5\n');
it('clamps threshold to 1 when above 1', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: 1.5\n');
const config = loadStudioConfig(tempDir);
expect(config.pass_threshold).toBe(1);
expect(config.threshold).toBe(1);
});

it('returns defaults for empty config.yaml', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), '');
const config = loadStudioConfig(tempDir);
expect(config.pass_threshold).toBe(PASS_THRESHOLD);
expect(config.threshold).toBe(DEFAULT_THRESHOLD);
});

it('returns defaults when pass_threshold is not a number', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: "high"\n');
it('returns defaults when threshold is not a number', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: "high"\n');
const config = loadStudioConfig(tempDir);
expect(config.pass_threshold).toBe(PASS_THRESHOLD);
expect(config.threshold).toBe(DEFAULT_THRESHOLD);
});
});

Expand All @@ -86,43 +101,54 @@ describe('saveStudioConfig', () => {
path.join(tempDir, 'config.yaml'),
'required_version: ">=4.2.0"\neval_patterns:\n - "**/*.eval.yaml"\n',
);
saveStudioConfig(tempDir, { pass_threshold: 0.9 });
saveStudioConfig(tempDir, { threshold: 0.9 });

const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8');
const parsed = parseYaml(raw) as Record<string, unknown>;
expect(parsed.required_version).toBe('>=4.2.0');
expect(parsed.eval_patterns).toEqual(['**/*.eval.yaml']);
expect((parsed.studio as Record<string, unknown>).pass_threshold).toBe(0.9);
expect((parsed.studio as Record<string, unknown>).threshold).toBe(0.9);
});

it('removes legacy root-level pass_threshold on save', () => {
writeFileSync(
path.join(tempDir, 'config.yaml'),
'required_version: ">=4.2.0"\npass_threshold: 0.8\n',
);
saveStudioConfig(tempDir, { pass_threshold: 0.7 });
saveStudioConfig(tempDir, { threshold: 0.7 });

const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8');
const parsed = parseYaml(raw) as Record<string, unknown>;
expect(parsed.required_version).toBe('>=4.2.0');
expect(parsed.pass_threshold).toBeUndefined();
expect((parsed.studio as Record<string, unknown>).pass_threshold).toBe(0.7);
expect((parsed.studio as Record<string, unknown>).threshold).toBe(0.7);
});

it('removes legacy pass_threshold from studio section on save', () => {
writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 0.8\n');
saveStudioConfig(tempDir, { threshold: 0.7 });

const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8');
const parsed = parseYaml(raw) as Record<string, unknown>;
const studio = parsed.studio as Record<string, unknown>;
expect(studio.pass_threshold).toBeUndefined();
expect(studio.threshold).toBe(0.7);
});

it('creates config.yaml when it does not exist', () => {
saveStudioConfig(tempDir, { pass_threshold: 0.6 });
saveStudioConfig(tempDir, { threshold: 0.6 });

const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8');
const parsed = parseYaml(raw) as Record<string, unknown>;
expect((parsed.studio as Record<string, unknown>).pass_threshold).toBe(0.6);
expect((parsed.studio as Record<string, unknown>).threshold).toBe(0.6);
});

it('creates directory if it does not exist', () => {
const nestedDir = path.join(tempDir, 'nested', '.agentv');
saveStudioConfig(nestedDir, { pass_threshold: 0.5 });
saveStudioConfig(nestedDir, { threshold: 0.5 });

const raw = readFileSync(path.join(nestedDir, 'config.yaml'), 'utf-8');
const parsed = parseYaml(raw) as Record<string, unknown>;
expect((parsed.studio as Record<string, unknown>).pass_threshold).toBe(0.5);
expect((parsed.studio as Record<string, unknown>).threshold).toBe(0.5);
});
});
Loading
Loading