From f625c6429991a08b2fea79fe1bcd94046abd3ea8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 6 Apr 2026 03:47:25 +0000 Subject: [PATCH 1/8] =?UTF-8?q?refactor(core):=20rename=20PASS=5FTHRESHOLD?= =?UTF-8?q?=20=E2=86=92=20DEFAULT=5FTHRESHOLD=20and=20thread=20configurabl?= =?UTF-8?q?e=20threshold?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename PASS_THRESHOLD to DEFAULT_THRESHOLD with deprecated alias - Add threshold parameter to scoreToVerdict() - Thread threshold through orchestrator: evaluateCandidate → runEvaluatorsForCase → runEvaluatorList - Wire per-test execution.threshold in yaml-parser - Fix evaluate() API: add EvalConfig.threshold, pass to computeSummary() - Fix required gate to use configurable threshold instead of hardcoded 0.8 - Replace local PASS_THRESHOLD constants in benchmark-writer and artifact-writer with import - Resolution order: CLI --threshold > test execution.threshold > suite execution.threshold > DEFAULT_THRESHOLD Closes #925 (partial) Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/artifact-writer.ts | 7 +++--- .../cli/src/commands/eval/benchmark-writer.ts | 8 +++--- apps/cli/src/commands/eval/statistics.ts | 2 +- apps/cli/src/commands/inspect/utils.ts | 4 +-- .../cli/src/commands/results/studio-config.ts | 4 +-- .../commands/results/studio-config.test.ts | 8 +++--- apps/studio/src/lib/api.ts | 2 +- packages/core/src/evaluation/evaluate.ts | 19 +++++++++----- .../core/src/evaluation/evaluators/index.ts | 1 + .../core/src/evaluation/evaluators/scoring.ts | 25 ++++++++++++------- packages/core/src/evaluation/orchestrator.ts | 21 ++++++++++------ packages/core/src/evaluation/types.ts | 2 ++ packages/core/src/evaluation/yaml-parser.ts | 7 ++++++ 13 files changed, 69 insertions(+), 41 deletions(-) diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 03c2e901f..af44f2007 100644 --- 
a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -1,7 +1,7 @@ import { mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; -import type { EvaluationResult, EvaluatorResult } from '@agentv/core'; +import { DEFAULT_THRESHOLD, type EvaluationResult, type EvaluatorResult } from '@agentv/core'; import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; import { RESULT_INDEX_FILENAME } from './result-layout.js'; @@ -118,7 +118,6 @@ export type ResultIndexArtifact = IndexArtifactEntry; // Statistics helpers // --------------------------------------------------------------------------- -const PASS_THRESHOLD = 0.8; function computeStats(values: readonly number[]): { mean: number; stddev: number } { if (values.length === 0) { @@ -135,10 +134,10 @@ function computeStats(values: readonly number[]): { mean: number; stddev: number function computePassRate(result: EvaluationResult): number { const scores = result.scores; if (scores && scores.length > 0) { - const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length; + const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length; return passed / scores.length; } - return (result.score ?? 0) >= PASS_THRESHOLD ? 1.0 : 0.0; + return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 
1.0 : 0.0; } // --------------------------------------------------------------------------- diff --git a/apps/cli/src/commands/eval/benchmark-writer.ts b/apps/cli/src/commands/eval/benchmark-writer.ts index f1056b6a7..562dd8a87 100644 --- a/apps/cli/src/commands/eval/benchmark-writer.ts +++ b/apps/cli/src/commands/eval/benchmark-writer.ts @@ -1,8 +1,6 @@ import { writeFile } from 'node:fs/promises'; -import type { EvaluationResult } from '@agentv/core'; - -const PASS_THRESHOLD = 0.8; +import { DEFAULT_THRESHOLD, type EvaluationResult } from '@agentv/core'; interface BenchmarkStats { readonly mean: number; @@ -43,10 +41,10 @@ function computeStats(values: readonly number[]): BenchmarkStats { function computePassRate(result: EvaluationResult): number { const scores = result.scores; if (scores && scores.length > 0) { - const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length; + const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length; return passed / scores.length; } - return result.score >= PASS_THRESHOLD ? 1.0 : 0.0; + return result.score >= DEFAULT_THRESHOLD ? 1.0 : 0.0; } /** diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 1e204b5c0..553d14878 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -135,7 +135,7 @@ export function calculateEvaluationSummary( // Count by execution status. When a custom threshold is provided, // recompute passed/failed from raw scores instead of executionStatus - // (which uses the hardcoded PASS_THRESHOLD of 0.8). + // (which uses the hardcoded DEFAULT_THRESHOLD of 0.8). 
const executionErrorCount = executionErrors.length; const scoreThreshold = options?.threshold; const passedCount = diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index f10a97ab4..01c6f6fc0 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -1,7 +1,7 @@ import { readFileSync, readdirSync, statSync } from 'node:fs'; import path from 'node:path'; import type { EvaluationResult, TraceSummary } from '@agentv/core'; -import { PASS_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; +import { DEFAULT_THRESHOLD, toCamelCaseDeep } from '@agentv/core'; import { RESULT_INDEX_FILENAME, RESULT_RUNS_DIRNAME, @@ -567,7 +567,7 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { const results = loadResultFile(filePath); const testCount = results.length; - const passCount = results.filter((r) => r.score >= PASS_THRESHOLD).length; + const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length; const passRate = testCount > 0 ? passCount / testCount : 0; const avgScore = testCount > 0 ? 
results.reduce((sum, r) => sum + r.score, 0) / testCount : 0; diff --git a/apps/cli/src/commands/results/studio-config.ts b/apps/cli/src/commands/results/studio-config.ts index 14959a9e9..f922c7a28 100644 --- a/apps/cli/src/commands/results/studio-config.ts +++ b/apps/cli/src/commands/results/studio-config.ts @@ -21,7 +21,7 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'; import path from 'node:path'; -import { PASS_THRESHOLD } from '@agentv/core'; +import { DEFAULT_THRESHOLD } from '@agentv/core'; import { parse as parseYaml, stringify as stringifyYaml } from 'yaml'; export interface StudioConfig { @@ -29,7 +29,7 @@ export interface StudioConfig { } const DEFAULTS: StudioConfig = { - pass_threshold: PASS_THRESHOLD, + pass_threshold: DEFAULT_THRESHOLD, }; /** diff --git a/apps/cli/test/commands/results/studio-config.test.ts b/apps/cli/test/commands/results/studio-config.test.ts index 4a8e46e84..ebd88842d 100644 --- a/apps/cli/test/commands/results/studio-config.test.ts +++ b/apps/cli/test/commands/results/studio-config.test.ts @@ -3,7 +3,7 @@ import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; -import { PASS_THRESHOLD } from '@agentv/core'; +import { DEFAULT_THRESHOLD } from '@agentv/core'; import { parse as parseYaml } from 'yaml'; import { loadStudioConfig, saveStudioConfig } from '../../../src/commands/results/studio-config.js'; @@ -21,7 +21,7 @@ describe('loadStudioConfig', () => { it('returns defaults when no config.yaml exists', () => { const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(PASS_THRESHOLD); + expect(config.pass_threshold).toBe(DEFAULT_THRESHOLD); }); it('reads pass_threshold from studio section', () => { @@ -60,13 +60,13 @@ describe('loadStudioConfig', () => { it('returns defaults for empty config.yaml', () => { writeFileSync(path.join(tempDir, 'config.yaml'), ''); const config = 
loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(PASS_THRESHOLD); + expect(config.pass_threshold).toBe(DEFAULT_THRESHOLD); }); it('returns defaults when pass_threshold is not a number', () => { writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: "high"\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(PASS_THRESHOLD); + expect(config.pass_threshold).toBe(DEFAULT_THRESHOLD); }); }); diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 07f2883b0..267106f81 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -195,7 +195,7 @@ export function useStudioConfig() { return useQuery(studioConfigOptions); } -/** Default pass threshold matching @agentv/core PASS_THRESHOLD */ +/** Default pass threshold matching @agentv/core DEFAULT_THRESHOLD */ export const DEFAULT_PASS_THRESHOLD = 0.8; export function isPassing(score: number, passThreshold: number = DEFAULT_PASS_THRESHOLD): boolean { diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index d3bc52eb4..677a9b3d2 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -61,7 +61,7 @@ import path from 'node:path'; import { buildDirectoryChain, findGitRoot } from './file-utils.js'; import type { AssertFn } from './assertions.js'; -import { PASS_THRESHOLD } from './evaluators/scoring.js'; +import { DEFAULT_THRESHOLD } from './evaluators/scoring.js'; import { runEvaluation } from './orchestrator.js'; import { createFunctionProvider } from './providers/function-provider.js'; import { readTargetDefinitions } from './providers/targets-file.js'; @@ -158,6 +158,8 @@ export interface EvalConfig { readonly verbose?: boolean; /** Callback for each completed result */ readonly onResult?: (result: EvaluationResult) => void; + /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). 
*/ + readonly threshold?: number; } /** @@ -166,9 +168,9 @@ export interface EvalConfig { export interface EvalSummary { /** Total number of test cases */ readonly total: number; - /** Number of passing test cases (score >= PASS_THRESHOLD) */ + /** Number of passing test cases (score >= threshold) */ readonly passed: number; - /** Number of failing test cases (score < PASS_THRESHOLD) */ + /** Number of failing test cases (score < threshold) */ readonly failed: number; /** Total duration in milliseconds */ readonly durationMs: number; @@ -342,6 +344,7 @@ export async function evaluate(config: EvalConfig): Promise { verbose: config.verbose, maxConcurrency: config.workers ?? 3, filter: config.filter, + threshold: config.threshold, evalCases, onResult: async (result) => { collectedResults.push(result); @@ -354,7 +357,7 @@ export async function evaluate(config: EvalConfig): Promise { return { results: allResults, - summary: computeSummary(allResults, durationMs), + summary: computeSummary(allResults, durationMs, config.threshold), }; } @@ -369,14 +372,18 @@ function mapAssertionType(type: string): string { /** * Compute summary statistics from evaluation results. 
*/ -function computeSummary(results: readonly EvaluationResult[], durationMs: number): EvalSummary { +function computeSummary( + results: readonly EvaluationResult[], + durationMs: number, + threshold = DEFAULT_THRESHOLD, +): EvalSummary { const total = results.length; let passed = 0; let scoreSum = 0; for (const r of results) { scoreSum += r.score; - if (r.score >= PASS_THRESHOLD) { + if (r.score >= threshold) { passed++; } } diff --git a/packages/core/src/evaluation/evaluators/index.ts b/packages/core/src/evaluation/evaluators/index.ts index 8cfb216fd..c1d01106a 100644 --- a/packages/core/src/evaluation/evaluators/index.ts +++ b/packages/core/src/evaluation/evaluators/index.ts @@ -10,6 +10,7 @@ export type { // Scoring utilities export { + DEFAULT_THRESHOLD, PASS_THRESHOLD, clampScore, deepEqual, diff --git a/packages/core/src/evaluation/evaluators/scoring.ts b/packages/core/src/evaluation/evaluators/scoring.ts index ddb28d184..49f1611cf 100644 --- a/packages/core/src/evaluation/evaluators/scoring.ts +++ b/packages/core/src/evaluation/evaluators/scoring.ts @@ -3,24 +3,31 @@ * * Scoring model: * score ∈ [0, 1] — continuous quality signal - * verdict — binary classification derived from score via PASS_THRESHOLD + * verdict — binary classification derived from score via threshold * - * score >= PASS_THRESHOLD → 'pass' - * score < PASS_THRESHOLD → 'fail' + * score >= threshold → 'pass' + * score < threshold → 'fail' * (infrastructure skip) → 'skip' * - * To change the pass/fail boundary, update PASS_THRESHOLD. - * All verdict derivation flows through scoreToVerdict(). + * Scoring scale principle: + * All user-configurable score thresholds use 0-1 scale. + * The only 0-10 values in YAML are `score_ranges` which define LLM integer output band labels. + * + * Default threshold is 0.8. Override via CLI `--threshold`, suite `execution.threshold`, + * or per-test `execution.threshold`. All verdict derivation flows through scoreToVerdict(). 
*/ import type { EvaluationVerdict } from '../types.js'; import type { EvaluationScore } from './types.js'; -/** Score threshold for pass verdict. Scores below this are fail. */ -export const PASS_THRESHOLD = 0.8; +/** Default score threshold for pass verdict (0-1). Scores below this are fail. */ +export const DEFAULT_THRESHOLD = 0.8; + +/** @deprecated Use DEFAULT_THRESHOLD instead. */ +export const PASS_THRESHOLD = DEFAULT_THRESHOLD; -export function scoreToVerdict(score: number): EvaluationVerdict { - return score >= PASS_THRESHOLD ? 'pass' : 'fail'; +export function scoreToVerdict(score: number, threshold = DEFAULT_THRESHOLD): EvaluationVerdict { + return score >= threshold ? 'pass' : 'fail'; } export function clampScore(value: number): number { diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index c58a0c5b8..206df7fe4 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -10,7 +10,7 @@ import { type EvaluationScore, type Evaluator, LlmGraderEvaluator, - PASS_THRESHOLD, + DEFAULT_THRESHOLD, negateScore, scoreToVerdict, } from './evaluators.js'; @@ -85,7 +85,7 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j type MaybePromise = T | Promise; -function classifyQualityStatus(score: number, threshold = PASS_THRESHOLD): ExecutionStatus { +function classifyQualityStatus(score: number, threshold = DEFAULT_THRESHOLD): ExecutionStatus { return score >= threshold ? 'ok' : 'quality_failure'; } @@ -1268,7 +1268,7 @@ async function runBatchEvaluation(options: { targetResolver, availableTargets, verbose, - threshold: batchThreshold, + threshold: evalCase.threshold ?? 
batchThreshold, }); if (providerError) { @@ -1806,9 +1806,10 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { const { evalCase, @@ -2223,6 +2226,7 @@ async function runEvaluatorsForCase(options: { availableTargets, fileChanges, workspacePath, + threshold, } = options; if (evalCase.assertions && evalCase.assertions.length > 0) { @@ -2250,6 +2254,7 @@ async function runEvaluatorsForCase(options: { availableTargets, fileChanges, workspacePath, + threshold, }); } @@ -2310,6 +2315,7 @@ async function runEvaluatorList(options: { readonly availableTargets?: readonly string[]; readonly fileChanges?: string; readonly workspacePath?: string; + readonly threshold?: number; }): Promise<{ score: EvaluationScore; scores: EvaluatorResult[] }> { const { evalCase, @@ -2469,9 +2475,10 @@ async function runEvaluatorList(options: { } // Required gate: if any evaluator with `required` flag fails its threshold, aggregate becomes 0 + const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD; const hasRequiredFailure = scored.some((entry) => { if (!entry.required) return false; - const minScore = typeof entry.required === 'number' ? entry.required : PASS_THRESHOLD; + const minScore = typeof entry.required === 'number' ? 
entry.required : effectiveThreshold; return entry.score.score < minScore; }); @@ -2489,7 +2496,7 @@ async function runEvaluatorList(options: { const score: EvaluationScore = { score: aggregateScore, - verdict: scoreToVerdict(aggregateScore), + verdict: scoreToVerdict(aggregateScore, effectiveThreshold), assertions, expectedAspectCount, }; diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 51d2841ae..47af2805f 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -788,6 +788,8 @@ export interface EvalTest { readonly metadata?: Record; /** Per-test target override (matrix evaluation) */ readonly targets?: readonly string[]; + /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */ + readonly threshold?: number; } /** @deprecated Use `EvalTest` instead */ diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 24f0c65bd..4e3a9f5af 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -357,6 +357,12 @@ async function loadTestsFromYaml( ? testCaseConfig.execution : undefined; const skipDefaults = caseExecution?.skip_defaults === true; + const caseThreshold = + typeof caseExecution?.threshold === 'number' && + (caseExecution.threshold as number) >= 0 && + (caseExecution.threshold as number) <= 1 + ? (caseExecution.threshold as number) + : undefined; // Resolve input with shorthand support (pass suite-level input_files for merge) const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : undefined; @@ -502,6 +508,7 @@ async function loadTestsFromYaml( workspace: mergedWorkspace, metadata, targets: caseTargets, + ...(caseThreshold !== undefined ? 
{ threshold: caseThreshold } : {}), }; results.push(testCase); From 1337638442b9e43be2f9e494bd796dd986f023ca Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 6 Apr 2026 03:52:33 +0000 Subject: [PATCH 2/8] feat(core): add min_score field to assertion evaluators - Add min_score (0-1) to EvaluatorCommonSchema and all 22 evaluator config types - Parse min_score from YAML via parseRequiredAndMinScore helper - Deprecated: required: <number> now emits warning, parsed as required: true + min_score - Orchestrator required gate reads min_score preferentially over required: number - Add min_score to EvalAssertionInput (programmatic API) Closes #925 (partial) Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/core/src/evaluation/evaluate.ts | 2 + .../evaluation/loaders/evaluator-parser.ts | 106 ++++++++++++++---- packages/core/src/evaluation/orchestrator.ts | 5 +- packages/core/src/evaluation/trace.ts | 2 + packages/core/src/evaluation/types.ts | 44 ++++++++ .../evaluation/validation/eval-file.schema.ts | 2 + 6 files changed, 138 insertions(+), 23 deletions(-) diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index 677a9b3d2..619bc90d8 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -112,6 +112,8 @@ export interface EvalAssertionInput { readonly weight?: number; /** Whether this assertion is required to pass */ readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ + readonly min_score?: number; /** Prompt file for llm_grader */ readonly prompt?: string; /** Script for code_grader */ diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 2cecc40e9..2564dd4e8 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -179,9 +179,9 @@ async function parseEvaluatorList( // Custom assertion types — store with their type name for registry dispatch if (isCustomType) { const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); // Collect all properties except known meta-keys as pass-through config - const knownProps = new Set(['name', 'type', 'weight', 'required', 'negate']); + const knownProps = new Set(['name', 'type', 'weight', 'required', 'min_score', 'negate']); const config: Record = {}; for (const [key, value] of Object.entries(rawEvaluator)) { if (!knownProps.has(key) && value !== undefined) { @@ -193,6 +193,7 @@ async function parseEvaluatorList( type: customTypeName as unknown as EvaluatorKind, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), ...(Object.keys(config).length > 0 ? 
{ config } : {}), } as EvaluatorConfig); @@ -275,7 +276,7 @@ async function parseEvaluatorList( } } - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); // Collect unrecognized properties as pass-through config const knownProps = new Set([ @@ -304,6 +305,7 @@ async function parseEvaluatorList( resolvedCwd, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), ...(Object.keys(config).length > 0 ? { config } : {}), ...(targetConfig !== undefined ? { target: targetConfig } : {}), @@ -471,7 +473,7 @@ async function parseEvaluatorList( } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, @@ -480,6 +482,7 @@ async function parseEvaluatorList( aggregator, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -628,7 +631,7 @@ async function parseEvaluatorList( } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); const config: ToolTrajectoryEvaluatorConfig = { name, @@ -638,6 +641,7 @@ async function parseEvaluatorList( ...(expected ? { expected } : {}), ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? 
{ required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), ...(argsMatch !== undefined ? { argsMatch } : {}), }; @@ -714,7 +718,7 @@ async function parseEvaluatorList( const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : undefined; const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, @@ -723,6 +727,7 @@ async function parseEvaluatorList( ...(validAggregation ? { aggregation: validAggregation } : {}), ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -738,7 +743,7 @@ async function parseEvaluatorList( } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, @@ -746,6 +751,7 @@ async function parseEvaluatorList( threshold, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? 
{ negate } : {}), }); continue; @@ -761,7 +767,7 @@ async function parseEvaluatorList( } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, @@ -769,6 +775,7 @@ async function parseEvaluatorList( budget, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -810,7 +817,7 @@ async function parseEvaluatorList( } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, @@ -818,6 +825,7 @@ async function parseEvaluatorList( ...validLimits, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -889,7 +897,7 @@ async function parseEvaluatorList( } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, @@ -897,6 +905,7 @@ async function parseEvaluatorList( ...validThresholds, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? 
{ negate } : {}), }); continue; @@ -911,7 +920,7 @@ async function parseEvaluatorList( const rawShouldTrigger = rawEvaluator.should_trigger; const shouldTrigger = typeof rawShouldTrigger === 'boolean' ? rawShouldTrigger : undefined; const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: 'skill-trigger', @@ -919,6 +928,7 @@ async function parseEvaluatorList( ...(shouldTrigger !== undefined ? { should_trigger: shouldTrigger } : {}), ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -931,13 +941,14 @@ async function parseEvaluatorList( continue; } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: 'contains', value, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -952,13 +963,14 @@ async function parseEvaluatorList( continue; } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: typeValue, value, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? 
{ required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), } as import('../types.js').EvaluatorConfig); continue; @@ -971,13 +983,14 @@ async function parseEvaluatorList( continue; } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: 'icontains', value, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), } as import('../types.js').EvaluatorConfig); continue; @@ -992,13 +1005,14 @@ async function parseEvaluatorList( continue; } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: typeValue, value, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), } as import('../types.js').EvaluatorConfig); continue; @@ -1011,13 +1025,14 @@ async function parseEvaluatorList( continue; } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: typeValue, value, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? 
{ min_score } : {}), ...(negate !== undefined ? { negate } : {}), } as import('../types.js').EvaluatorConfig); continue; @@ -1031,7 +1046,7 @@ async function parseEvaluatorList( } const flags = asString(rawEvaluator.flags); const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: 'regex', @@ -1039,6 +1054,7 @@ async function parseEvaluatorList( ...(flags !== undefined ? { flags } : {}), ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -1046,12 +1062,13 @@ async function parseEvaluatorList( if (typeValue === 'is-json') { const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: 'is-json', ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -1064,13 +1081,14 @@ async function parseEvaluatorList( continue; } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, type: 'equals', value, ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? 
{ min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -1112,7 +1130,7 @@ async function parseEvaluatorList( } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); evaluators.push({ name, @@ -1121,6 +1139,7 @@ async function parseEvaluatorList( ...(graderTargetName ? { target: graderTargetName } : {}), ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), }); continue; @@ -1215,7 +1234,7 @@ async function parseEvaluatorList( } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); // deprecated: `type: rubric` maps to `type: llm-grader` with `rubrics`. Use `type: rubrics` with `criteria` instead. evaluators.push({ @@ -1225,13 +1244,14 @@ async function parseEvaluatorList( ...(graderTargetName ? { target: graderTargetName } : {}), ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? 
{ negate } : {}), }); continue; } const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); + const { required, min_score } = parseRequiredAndMinScore(rawEvaluator.required, (rawEvaluator as Record).min_score as JsonValue | undefined, name, evalId); // Collect unrecognized properties as pass-through config (for text prompt templates) // Note: For script prompts, config comes from prompt.config instead @@ -1245,6 +1265,7 @@ async function parseEvaluatorList( 'weight', 'config', 'required', + 'min_score', 'negate', 'max_steps', 'maxSteps', @@ -1291,6 +1312,7 @@ async function parseEvaluatorList( ...(graderTargetName ? { target: graderTargetName } : {}), ...(weight !== undefined ? { weight } : {}), ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), ...(negate !== undefined ? { negate } : {}), ...(finalConfig ? { config: finalConfig } : {}), ...(llmMaxSteps !== undefined ? { max_steps: llmMaxSteps } : {}), @@ -1472,6 +1494,46 @@ function parseRequired(value: JsonValue | undefined): boolean | number | undefin return undefined; } +/** + * Parse `required` and `min_score` from raw evaluator config, handling deprecated `required: number`. 
+ * + * - `required: true` → `{ required: true }` + * - `required: 0.7` (deprecated) → `{ required: true, min_score: 0.7 }` + deprecation warning + * - `min_score: 0.7` → `{ min_score: 0.7 }` + * - Explicit `min_score` takes priority over `required: number` + */ +function parseRequiredAndMinScore( + rawRequired: JsonValue | undefined, + rawMinScore: JsonValue | undefined, + evaluatorName: string, + evalId: string, +): { required?: boolean | number; min_score?: number } { + const result: { required?: boolean | number; min_score?: number } = {}; + + // Parse min_score (explicit field, takes priority) + if (typeof rawMinScore === 'number' && rawMinScore > 0 && rawMinScore <= 1) { + result.min_score = rawMinScore; + } + + // Parse required + if (rawRequired === true) { + result.required = true; + } else if (typeof rawRequired === 'number' && rawRequired > 0 && rawRequired <= 1) { + // Deprecated: required: number → required: true + min_score + if (result.min_score === undefined) { + result.min_score = rawRequired; + } + // Keep numeric required for backward compat (orchestrator reads min_score preferentially) + result.required = rawRequired; + logWarning( + `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. ` + + `Use 'required: true' + 'min_score: ${rawRequired}' instead.`, + ); + } + + return result; +} + /** * Validate and extract weight from evaluator config. * Throws if weight is invalid (negative, NaN, or Infinity). 
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 206df7fe4..92a112aed 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -2349,6 +2349,7 @@ async function runEvaluatorList(options: { readonly type: string; readonly weight?: number; readonly required?: boolean | number; + readonly min_score?: number; }> = []; const scores: EvaluatorResult[] = []; @@ -2403,6 +2404,7 @@ async function runEvaluatorList(options: { type: evaluatorConfig.type, weight, ...(evaluatorConfig.required !== undefined ? { required: evaluatorConfig.required } : {}), + ...(evaluatorConfig.min_score !== undefined ? { min_score: evaluatorConfig.min_score } : {}), }); scores.push({ name: evaluatorConfig.name, @@ -2438,6 +2440,7 @@ async function runEvaluatorList(options: { type: evaluatorConfig.type ?? 'llm-grader', weight, ...(evaluatorConfig.required !== undefined ? { required: evaluatorConfig.required } : {}), + ...(evaluatorConfig.min_score !== undefined ? { min_score: evaluatorConfig.min_score } : {}), }); scores.push({ name: evaluatorConfig.name ?? 'unknown', @@ -2478,7 +2481,7 @@ async function runEvaluatorList(options: { const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD; const hasRequiredFailure = scored.some((entry) => { if (!entry.required) return false; - const minScore = typeof entry.required === 'number' ? entry.required : effectiveThreshold; + const minScore = entry.min_score ?? (typeof entry.required === 'number' ? 
entry.required : effectiveThreshold); return entry.score.score < minScore; }); diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index d5128737b..65d0aedaf 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -71,6 +71,8 @@ export interface ToolTrajectoryEvaluatorConfig { /** Optional weight for top-level aggregation (defaults to 1.0) */ readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; /** Default argument matching mode for all expected items (defaults to 'exact') */ diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 47af2805f..ab0152ef9 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -303,6 +303,8 @@ export type CodeEvaluatorConfig = { readonly resolvedCwd?: string; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */ @@ -337,6 +339,8 @@ export type LlmGraderEvaluatorConfig = { readonly rubrics?: readonly RubricItem[]; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; /** Optional target override for this grader (uses a named LLM target from targets.yaml). 
*/ @@ -413,6 +417,8 @@ export type CompositeEvaluatorConfig = { readonly aggregator: CompositeAggregatorConfig; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -461,6 +467,8 @@ export type FieldAccuracyEvaluatorConfig = { readonly aggregation?: FieldAggregationType; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -476,6 +484,8 @@ export type LatencyEvaluatorConfig = { readonly threshold: number; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -491,6 +501,8 @@ export type CostEvaluatorConfig = { readonly budget: number; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -510,6 +522,8 @@ export type TokenUsageEvaluatorConfig = { readonly max_output?: number; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -538,6 +552,8 @@ export type ExecutionMetricsEvaluatorConfig = { readonly exploration_tolerance?: number; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -552,6 +568,8 @@ export type ContainsEvaluatorConfig = { readonly value: string; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -566,6 +584,8 @@ export type ContainsAnyEvaluatorConfig = { readonly value: readonly string[]; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -580,6 +600,8 @@ export type ContainsAllEvaluatorConfig = { readonly value: readonly string[]; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -594,6 +616,8 @@ export type IcontainsEvaluatorConfig = { readonly value: string; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -608,6 +632,8 @@ export type IcontainsAnyEvaluatorConfig = { readonly value: readonly string[]; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -622,6 +648,8 @@ export type IcontainsAllEvaluatorConfig = { readonly value: readonly string[]; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -636,6 +664,8 @@ export type StartsWithEvaluatorConfig = { readonly value: string; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -650,6 +680,8 @@ export type EndsWithEvaluatorConfig = { readonly value: string; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -666,6 +698,8 @@ export type RegexEvaluatorConfig = { readonly flags?: string; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -679,6 +713,8 @@ export type IsJsonEvaluatorConfig = { readonly type: 'is-json'; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -693,6 +729,8 @@ export type EqualsEvaluatorConfig = { readonly value: string; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -707,6 +745,8 @@ export type RubricsEvaluatorConfig = { readonly criteria: readonly RubricItem[]; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ readonly negate?: boolean; }; @@ -726,6 +766,8 @@ export type SkillTriggerEvaluatorConfig = { readonly should_trigger?: boolean; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; readonly negate?: boolean; }; @@ -738,6 +780,8 @@ export type InlineAssertEvaluatorConfig = { readonly type: 'inline-assert'; readonly weight?: number; readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. 
*/ + readonly min_score?: number; readonly negate?: boolean; }; diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 9a0686cca..152348983 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -40,6 +40,8 @@ const EvaluatorCommonSchema = z.object({ name: z.string().optional(), weight: z.number().min(0).optional(), required: z.union([z.boolean(), z.number().gt(0).lte(1)]).optional(), + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + min_score: z.number().gt(0).lte(1).optional(), negate: z.boolean().optional(), }); From de10d7f0b4fbf3b0e4ceb264720b3defd71b3285 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 6 Apr 2026 03:55:25 +0000 Subject: [PATCH 3/8] feat(core): add min_score (0-1 scale) to rubrics, deprecate required_min_score (0-10) - Add min_score to RubricItemSchema and RubricItem type (0-1 scale) - Parser: min_score takes priority over deprecated required_min_score - Parser: required_min_score emits deprecation warning, converts to min_score - LLM grader: compare normalized score against min_score (avoids rounding issues) - Prompt generation: prefer min_score for criterion labels - Inline rubrics: support both min_score and required_min_score Closes #925 (partial) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/evaluation/evaluators/llm-grader.ts | 31 ++++++----- .../evaluation/loaders/evaluator-parser.ts | 53 +++++++++++++++---- packages/core/src/evaluation/types.ts | 12 +++-- .../evaluation/validation/eval-file.schema.ts | 3 ++ 4 files changed, 72 insertions(+), 27 deletions(-) diff --git a/packages/core/src/evaluation/evaluators/llm-grader.ts b/packages/core/src/evaluation/evaluators/llm-grader.ts index 55f906ff8..d53599358 100644 --- a/packages/core/src/evaluation/evaluators/llm-grader.ts +++ 
b/packages/core/src/evaluation/evaluators/llm-grader.ts @@ -878,9 +878,11 @@ export class LlmGraderEvaluator implements Evaluator { for (const rubric of rubrics) { const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; const minScoreLabel = - rubric.required_min_score !== undefined - ? ` [REQUIRED: min score ${rubric.required_min_score}]` - : ''; + rubric.min_score !== undefined + ? ` [REQUIRED: min score ${rubric.min_score}]` + : rubric.required_min_score !== undefined + ? ` [REQUIRED: min score ${rubric.required_min_score}]` + : ''; parts.push('', `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`); @@ -1285,15 +1287,18 @@ function calculateScoreRangeResult( totalWeight += rubric.weight; weightedScoreSum += normalizedScore * rubric.weight; - // Determine required minimum score: - // - If required_min_score is set, use it directly - // - If required is true (legacy), treat as required_min_score: 10 + // Determine required minimum score (as normalized 0-1): + // - If min_score is set (0-1), use directly + // - If required_min_score is set (legacy 0-10), normalize to 0-1 + // - If required is true (legacy), treat as min_score: 1.0 // - Otherwise, no gating - let requiredMinScore: number | undefined; - if (rubric.required_min_score !== undefined) { - requiredMinScore = rubric.required_min_score; + let minScoreThreshold: number | undefined; + if (rubric.min_score !== undefined) { + minScoreThreshold = rubric.min_score; + } else if (rubric.required_min_score !== undefined) { + minScoreThreshold = rubric.required_min_score / 10; } else if (rubric.required === true) { - requiredMinScore = 10; // Legacy: required: true means must score 10/10 + minScoreThreshold = 1.0; // Legacy: required: true means must score 10/10 } // Find the matching score range description for reporting @@ -1303,10 +1308,10 @@ function calculateScoreRangeResult( const rangeDescription = matchingRange?.outcome ?? ''; const criterionLabel = rubric.outcome ?? 
rubric.id; - // Check gating + // Check gating — compare normalized score against min_score threshold (both 0-1) const passed = - !(requiredMinScore !== undefined && rawScore < requiredMinScore) && rawScore >= 7; - if (requiredMinScore !== undefined && rawScore < requiredMinScore) { + !(minScoreThreshold !== undefined && normalizedScore < minScoreThreshold) && rawScore >= 7; + if (minScoreThreshold !== undefined && normalizedScore < minScoreThreshold) { failedRequired = true; } diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 2564dd4e8..79723a614 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -1606,18 +1606,36 @@ function parseRubricItems( const expectedOutcome = asString(rawRubric.outcome) ?? ''; const weight = typeof rawRubric.weight === 'number' ? rawRubric.weight : 1.0; - // Parse required_min_score (new) or required (legacy backward compat) + // Parse min_score (0-1 scale), required_min_score (deprecated 0-10 scale), and required + let minScore: number | undefined; let requiredMinScore: number | undefined; let required: boolean | undefined; - if (typeof rawRubric.required_min_score === 'number') { - const minScore = rawRubric.required_min_score; - if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) { + if (typeof rawRubric.min_score === 'number') { + // New field: 0-1 scale + const ms = rawRubric.min_score as number; + if (ms <= 0 || ms > 1) { throw new Error( - `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`, + `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`, ); } - requiredMinScore = minScore; + minScore = ms; + // Compute legacy required_min_score for backward compat with llm-grader internals + requiredMinScore = 
Math.round(ms * 10); + } else if (typeof rawRubric.required_min_score === 'number') { + // Deprecated: 0-10 integer scale + const rms = rawRubric.required_min_score as number; + if (!Number.isInteger(rms) || rms < 0 || rms > 10) { + throw new Error( + `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`, + ); + } + requiredMinScore = rms; + minScore = rms / 10; + logWarning( + `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. ` + + `Use 'min_score: ${rms / 10}' (0-1 scale) instead.`, + ); } if (typeof rawRubric.required === 'boolean') { @@ -1644,6 +1662,7 @@ function parseRubricItems( weight, ...(expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {}), ...(required !== undefined ? { required } : {}), + ...(minScore !== undefined ? { min_score: minScore } : {}), ...(requiredMinScore !== undefined ? { required_min_score: requiredMinScore } : {}), score_ranges: scoreRanges, }); @@ -1662,6 +1681,7 @@ function parseRubricItems( weight, // Default to required: true if not specified (backward compatibility) required: required ?? true, + ...(minScore !== undefined ? { min_score: minScore } : {}), ...(requiredMinScore !== undefined ? { required_min_score: requiredMinScore } : {}), }); } @@ -1883,14 +1903,26 @@ export function parseInlineRubrics( weight: typeof rubric.weight === 'number' ? 
rubric.weight : 1.0, }; + // Parse min_score (0-1) or required_min_score (deprecated 0-10) + let inlineMinScore: number | undefined; + let inlineRequiredMinScore: number | undefined; + if (typeof rubric.min_score === 'number') { + inlineMinScore = rubric.min_score as number; + inlineRequiredMinScore = Math.round(inlineMinScore * 10); + } else if (typeof rubric.required_min_score === 'number') { + inlineRequiredMinScore = rubric.required_min_score as number; + inlineMinScore = inlineRequiredMinScore / 10; + } + // For score_ranges rubrics, outcome at rubric level is optional if (scoreRanges && scoreRanges.length > 0) { return { ...baseRubric, ...(expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {}), ...(typeof rubric.required === 'boolean' ? { required: rubric.required } : {}), - ...(typeof rubric.required_min_score === 'number' - ? { required_min_score: rubric.required_min_score } + ...(inlineMinScore !== undefined ? { min_score: inlineMinScore } : {}), + ...(inlineRequiredMinScore !== undefined + ? { required_min_score: inlineRequiredMinScore } : {}), score_ranges: scoreRanges, }; @@ -1901,8 +1933,9 @@ export function parseInlineRubrics( ...baseRubric, outcome: expectedOutcome, required: typeof rubric.required === 'boolean' ? rubric.required : true, - ...(typeof rubric.required_min_score === 'number' - ? { required_min_score: rubric.required_min_score } + ...(inlineMinScore !== undefined ? { min_score: inlineMinScore } : {}), + ...(inlineRequiredMinScore !== undefined + ? { required_min_score: inlineRequiredMinScore } : {}), }; }) diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index ab0152ef9..fb4a83ba7 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -382,13 +382,17 @@ export type RubricItem = { readonly outcome?: string; readonly weight: number; /** - * Legacy boolean gating (deprecated, treated as required_min_score: 10). 
- * Use required_min_score instead for finer control. + * Legacy boolean gating (treated as min_score: 1.0 for score-range rubrics). */ readonly required?: boolean; /** - * Minimum score (0-10) required to pass this criterion. - * If the criterion score is below this threshold, the overall verdict is 'fail'. + * Minimum score (0-1 scale) required to pass this criterion. + * Internally compared against normalized score (rawScore / 10). + */ + readonly min_score?: number; + /** + * @deprecated Use min_score (0-1 scale) instead. + * Legacy: minimum score on 0-10 integer scale. */ readonly required_min_score?: number; /** diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 152348983..5f866305a 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -67,6 +67,9 @@ const RubricItemSchema = z.object({ outcome: z.string().optional(), weight: z.number().optional(), required: z.boolean().optional(), + /** Minimum score (0-1) for this criterion to pass. Replaces required_min_score (0-10). */ + min_score: z.number().gt(0).lte(1).optional(), + /** @deprecated Use min_score (0-1 scale) instead. Legacy: 0-10 integer scale. 
*/ required_min_score: z.number().int().min(0).max(10).optional(), score_ranges: z.array(ScoreRangeSchema).optional(), }); From 8d6d13c2e77b01c196d8d1ec34fc4cc26156e3aa Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 6 Apr 2026 03:58:47 +0000 Subject: [PATCH 4/8] =?UTF-8?q?feat(studio):=20rename=20pass=5Fthreshold?= =?UTF-8?q?=20=E2=86=92=20threshold=20in=20studio=20config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - StudioConfig.pass_threshold → StudioConfig.threshold - loadStudioConfig: reads studio.threshold, falls back to studio.pass_threshold, then root pass_threshold - saveStudioConfig: writes threshold, cleans legacy pass_threshold from both root and studio - StudioConfigResponse: add threshold field, deprecate pass_threshold - Update all studio components to read threshold with pass_threshold fallback - serve.ts: destructure as { threshold: pass_threshold } for minimal component diff Closes #925 (partial) Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/results/serve.ts | 14 ++-- .../cli/src/commands/results/studio-config.ts | 40 ++++++---- .../commands/results/studio-config.test.ts | 77 +++++++++++++------ apps/studio/src/components/EvalDetail.tsx | 2 +- apps/studio/src/components/RunDetail.tsx | 2 +- apps/studio/src/components/Sidebar.tsx | 6 +- apps/studio/src/lib/types.ts | 4 +- .../src/routes/runs/$runId_.suite.$suite.tsx | 2 +- apps/studio/src/routes/settings.tsx | 4 +- 9 files changed, 95 insertions(+), 56 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 2ef89726d..6bd2cc503 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -282,7 +282,7 @@ function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) { if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); - const { pass_threshold } = 
loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const suiteMap = new Map(); for (const r of loaded) { const ds = r.suite ?? r.target ?? 'default'; @@ -311,7 +311,7 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const categoryMap = new Map< string, { total: number; passed: number; scoreSum: number; suites: Set } @@ -351,7 +351,7 @@ function handleCategorySuites(c: C, { searchDir, agentvDir }: DataContext) { if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const filtered = loaded.filter((r) => (r.category ?? 
DEFAULT_CATEGORY) === category); const suiteMap = new Map(); for (const r of filtered) { @@ -467,7 +467,7 @@ function handleEvalFileContent(c: C, { searchDir }: DataContext) { function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) { const metas = listResultFiles(searchDir); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const experimentMap = new Map< string, { @@ -520,7 +520,7 @@ function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) { function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { const metas = listResultFiles(searchDir); - const { pass_threshold } = loadStudioConfig(agentvDir); + const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const targetMap = new Map< string, { @@ -615,8 +615,8 @@ export function createApp( const body = await c.req.json>(); const current = loadStudioConfig(agentvDir); const updated = { ...current, ...body }; - if (typeof updated.pass_threshold === 'number') { - updated.pass_threshold = Math.min(1, Math.max(0, updated.pass_threshold)); + if (typeof updated.threshold === 'number') { + updated.threshold = Math.min(1, Math.max(0, updated.threshold)); } saveStudioConfig(agentvDir, updated); return c.json(updated); diff --git a/apps/cli/src/commands/results/studio-config.ts b/apps/cli/src/commands/results/studio-config.ts index f922c7a28..4be3c56a5 100644 --- a/apps/cli/src/commands/results/studio-config.ts +++ b/apps/cli/src/commands/results/studio-config.ts @@ -10,10 +10,10 @@ * config.yaml format: * required_version: ">=4.2.0" * studio: - * pass_threshold: 0.8 # score >= this value is considered "pass" + * threshold: 0.8 # score >= this value is considered "pass" * - * Backward compat: reads root-level `pass_threshold` if `studio:` section - * is absent (legacy format). On save, always writes under `studio:`. 
+ * Backward compat: reads `studio.pass_threshold` and root-level `pass_threshold` + * as fallback. On save, always writes `threshold` under `studio:`. * * If no config.yaml exists, defaults are used. */ @@ -25,19 +25,19 @@ import { DEFAULT_THRESHOLD } from '@agentv/core'; import { parse as parseYaml, stringify as stringifyYaml } from 'yaml'; export interface StudioConfig { - pass_threshold: number; + threshold: number; } const DEFAULTS: StudioConfig = { - pass_threshold: DEFAULT_THRESHOLD, + threshold: DEFAULT_THRESHOLD, }; /** * Load studio config from `config.yaml` in the given `.agentv/` directory. - * Reads from `studio.pass_threshold`, falling back to root-level - * `pass_threshold` for backward compatibility. + * Reads from `studio.threshold`, falling back to `studio.pass_threshold` (legacy), + * then root-level `pass_threshold` (legacy) for backward compatibility. * Returns defaults when the file does not exist or is empty. - * Clamps `pass_threshold` to [0, 1]. + * Clamps `threshold` to [0, 1]. 
*/ export function loadStudioConfig(agentvDir: string): StudioConfig { const configPath = path.join(agentvDir, 'config.yaml'); @@ -53,20 +53,22 @@ export function loadStudioConfig(agentvDir: string): StudioConfig { return { ...DEFAULTS }; } - // Prefer studio.pass_threshold, fall back to root-level pass_threshold (legacy) + // Prefer studio.threshold, fall back to studio.pass_threshold, then root-level pass_threshold const studio = (parsed as Record).studio; - let threshold = DEFAULTS.pass_threshold; + let threshold = DEFAULTS.threshold; if (studio && typeof studio === 'object' && !Array.isArray(studio)) { - const studioThreshold = (studio as Record).pass_threshold; - if (typeof studioThreshold === 'number') { - threshold = studioThreshold; + const studioObj = studio as Record; + if (typeof studioObj.threshold === 'number') { + threshold = studioObj.threshold; + } else if (typeof studioObj.pass_threshold === 'number') { + threshold = studioObj.pass_threshold; } } else if (typeof (parsed as Record).pass_threshold === 'number') { threshold = (parsed as Record).pass_threshold as number; } return { - pass_threshold: Math.min(1, Math.max(0, threshold)), + threshold: Math.min(1, Math.max(0, threshold)), }; } @@ -97,8 +99,14 @@ export function saveStudioConfig(agentvDir: string, config: StudioConfig): void const { pass_threshold: _, ...rest } = existing; existing = rest; - // Merge studio section - existing.studio = { ...config }; + // Clean legacy pass_threshold from studio section if present + const existingStudio = existing.studio; + if (existingStudio && typeof existingStudio === 'object' && !Array.isArray(existingStudio)) { + const { pass_threshold: __, ...studioRest } = existingStudio as Record; + existing.studio = { ...studioRest, ...config }; + } else { + existing.studio = { ...config }; + } const yamlStr = stringifyYaml(existing); writeFileSync(configPath, yamlStr, 'utf-8'); diff --git a/apps/cli/test/commands/results/studio-config.test.ts 
b/apps/cli/test/commands/results/studio-config.test.ts index ebd88842d..572a35829 100644 --- a/apps/cli/test/commands/results/studio-config.test.ts +++ b/apps/cli/test/commands/results/studio-config.test.ts @@ -21,52 +21,67 @@ describe('loadStudioConfig', () => { it('returns defaults when no config.yaml exists', () => { const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(DEFAULT_THRESHOLD); + expect(config.threshold).toBe(DEFAULT_THRESHOLD); }); - it('reads pass_threshold from studio section', () => { + it('reads threshold from studio section', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: 0.6\n'); + const config = loadStudioConfig(tempDir); + expect(config.threshold).toBe(0.6); + }); + + it('reads pass_threshold from studio section as fallback (legacy)', () => { writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 0.6\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(0.6); + expect(config.threshold).toBe(0.6); + }); + + it('prefers studio.threshold over studio.pass_threshold', () => { + writeFileSync( + path.join(tempDir, 'config.yaml'), + 'studio:\n threshold: 0.9\n pass_threshold: 0.5\n', + ); + const config = loadStudioConfig(tempDir); + expect(config.threshold).toBe(0.9); }); it('falls back to root-level pass_threshold (legacy)', () => { writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: 0.7\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(0.7); + expect(config.threshold).toBe(0.7); }); it('prefers studio section over root-level pass_threshold', () => { writeFileSync( path.join(tempDir, 'config.yaml'), - 'pass_threshold: 0.5\nstudio:\n pass_threshold: 0.9\n', + 'pass_threshold: 0.5\nstudio:\n threshold: 0.9\n', ); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(0.9); + expect(config.threshold).toBe(0.9); }); - it('clamps pass_threshold to 0 when negative', () => { 
- writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: -0.5\n'); + it('clamps threshold to 0 when negative', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: -0.5\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(0); + expect(config.threshold).toBe(0); }); - it('clamps pass_threshold to 1 when above 1', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 1.5\n'); + it('clamps threshold to 1 when above 1', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: 1.5\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(1); + expect(config.threshold).toBe(1); }); it('returns defaults for empty config.yaml', () => { writeFileSync(path.join(tempDir, 'config.yaml'), ''); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(DEFAULT_THRESHOLD); + expect(config.threshold).toBe(DEFAULT_THRESHOLD); }); - it('returns defaults when pass_threshold is not a number', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: "high"\n'); + it('returns defaults when threshold is not a number', () => { + writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: "high"\n'); const config = loadStudioConfig(tempDir); - expect(config.pass_threshold).toBe(DEFAULT_THRESHOLD); + expect(config.threshold).toBe(DEFAULT_THRESHOLD); }); }); @@ -86,13 +101,13 @@ describe('saveStudioConfig', () => { path.join(tempDir, 'config.yaml'), 'required_version: ">=4.2.0"\neval_patterns:\n - "**/*.eval.yaml"\n', ); - saveStudioConfig(tempDir, { pass_threshold: 0.9 }); + saveStudioConfig(tempDir, { threshold: 0.9 }); const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); const parsed = parseYaml(raw) as Record; expect(parsed.required_version).toBe('>=4.2.0'); expect(parsed.eval_patterns).toEqual(['**/*.eval.yaml']); - expect((parsed.studio as 
Record).pass_threshold).toBe(0.9); + expect((parsed.studio as Record).threshold).toBe(0.9); }); it('removes legacy root-level pass_threshold on save', () => { @@ -100,29 +115,43 @@ describe('saveStudioConfig', () => { path.join(tempDir, 'config.yaml'), 'required_version: ">=4.2.0"\npass_threshold: 0.8\n', ); - saveStudioConfig(tempDir, { pass_threshold: 0.7 }); + saveStudioConfig(tempDir, { threshold: 0.7 }); const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); const parsed = parseYaml(raw) as Record; expect(parsed.required_version).toBe('>=4.2.0'); expect(parsed.pass_threshold).toBeUndefined(); - expect((parsed.studio as Record).pass_threshold).toBe(0.7); + expect((parsed.studio as Record).threshold).toBe(0.7); + }); + + it('removes legacy pass_threshold from studio section on save', () => { + writeFileSync( + path.join(tempDir, 'config.yaml'), + 'studio:\n pass_threshold: 0.8\n', + ); + saveStudioConfig(tempDir, { threshold: 0.7 }); + + const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); + const parsed = parseYaml(raw) as Record; + const studio = parsed.studio as Record; + expect(studio.pass_threshold).toBeUndefined(); + expect(studio.threshold).toBe(0.7); }); it('creates config.yaml when it does not exist', () => { - saveStudioConfig(tempDir, { pass_threshold: 0.6 }); + saveStudioConfig(tempDir, { threshold: 0.6 }); const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); const parsed = parseYaml(raw) as Record; - expect((parsed.studio as Record).pass_threshold).toBe(0.6); + expect((parsed.studio as Record).threshold).toBe(0.6); }); it('creates directory if it does not exist', () => { const nestedDir = path.join(tempDir, 'nested', '.agentv'); - saveStudioConfig(nestedDir, { pass_threshold: 0.5 }); + saveStudioConfig(nestedDir, { threshold: 0.5 }); const raw = readFileSync(path.join(nestedDir, 'config.yaml'), 'utf-8'); const parsed = parseYaml(raw) as Record; - expect((parsed.studio as 
Record).pass_threshold).toBe(0.5); + expect((parsed.studio as Record).threshold).toBe(0.5); }); }); diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index 020901eed..e28279cec 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -120,7 +120,7 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) function StepsTab({ result }: { result: EvalResult }) { const { data: config } = useStudioConfig(); - const passThreshold = config?.pass_threshold ?? 0.8; + const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; const assertions = result.assertions ?? []; const hasFailed = !isPassing(result.score, passThreshold) || diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 6a9a25ee4..baa5b8526 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -86,7 +86,7 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate export function RunDetail({ results, runId, projectId }: RunDetailProps) { const { data: config } = useStudioConfig(); - const passThreshold = config?.pass_threshold ?? 0.8; + const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; const total = results.length; const passed = results.filter((r) => isPassing(r.score, passThreshold)).length; diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index fa9f56d8b..461eb74ed 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -188,7 +188,7 @@ function RunSidebar() { function EvalSidebar({ runId, currentEvalId }: { runId: string; currentEvalId: string }) { const { data } = useRunDetail(runId); const { data: config } = useStudioConfig(); - const passThreshold = config?.pass_threshold ?? 0.8; + const passThreshold = config?.threshold ?? 
config?.pass_threshold ?? 0.8; return (