From a9dc26a9cda1cfd4a95b36ff727c113f32d219eb Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Fri, 17 Apr 2026 16:38:04 +0200 Subject: [PATCH] feat(eval): add edit-preflight discovery lane --- scripts/benchmark-comparators.mjs | 465 +++++++++++++++--- scripts/lib/managed-mcp-session.mjs | 2 +- scripts/run-eval.mjs | 88 +++- src/eval/edit-preflight-harness.ts | 266 ++++++++++ src/eval/run-config.ts | 37 ++ src/eval/types.ts | 96 ++++ tests/benchmark-comparators.test.ts | 99 ++++ tests/edit-preflight-harness.test.ts | 243 +++++++++ tests/fixtures/README.md | 32 +- .../edit-preflight-angular-spotify.json | 93 ++++ tests/fixtures/edit-preflight-excalidraw.json | 93 ++++ tests/run-eval-config.test.ts | 25 + 12 files changed, 1459 insertions(+), 80 deletions(-) create mode 100644 src/eval/edit-preflight-harness.ts create mode 100644 src/eval/run-config.ts create mode 100644 tests/edit-preflight-harness.test.ts create mode 100644 tests/fixtures/edit-preflight-angular-spotify.json create mode 100644 tests/fixtures/edit-preflight-excalidraw.json create mode 100644 tests/run-eval-config.test.ts diff --git a/scripts/benchmark-comparators.mjs b/scripts/benchmark-comparators.mjs index 0fbe5ca..e0a8e37 100644 --- a/scripts/benchmark-comparators.mjs +++ b/scripts/benchmark-comparators.mjs @@ -1,4 +1,3 @@ -#!/usr/bin/env node /** * Automated comparator benchmark runner for codebase-context discovery benchmark. 
* @@ -12,7 +11,7 @@ */ import path from 'path'; -import { fileURLToPath } from 'url'; +import { fileURLToPath, pathToFileURL } from 'url'; import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'fs'; import { execSync, execFile } from 'child_process'; import { parseArgs } from 'util'; @@ -51,6 +50,224 @@ function normalizeText(value) { return value.toLowerCase().replace(/\\/g, '/'); } +function normalizeRelativePath(candidate) { + if (typeof candidate !== 'string') return null; + const trimmed = candidate.trim().replace(/^["']|["']$/g, ''); + if (!trimmed) return null; + const normalized = trimmed.replace(/\\/g, '/').replace(/^\.\//, ''); + if (/^[A-Za-z]:\//.test(normalized)) { + return normalized.replace(/^[A-Za-z]:\//, ''); + } + return normalized; +} + +function normalizeFilesystemPath(candidate) { + if (typeof candidate !== 'string') return null; + return candidate.trim().replace(/\\/g, '/').replace(/\/+$/, '').toLowerCase(); +} + +function isLikelyCodePath(candidate) { + if (typeof candidate !== 'string') return false; + if (!candidate.includes('/')) return false; + const lastSegment = candidate.split('/').pop() ?? ''; + return /\.[A-Za-z0-9]+$/.test(lastSegment); +} + +function collectTopFiles(value, sink = []) { + if (Array.isArray(value)) { + for (const item of value) { + collectTopFiles(item, sink); + } + return sink; + } + + if (value && typeof value === 'object') { + for (const [key, nested] of Object.entries(value)) { + if ( + (key === 'file' || key === 'filePath' || key === 'path' || key === 'source') && + typeof nested === 'string' + ) { + const normalized = normalizeRelativePath(nested); + if (normalized && isLikelyCodePath(normalized) && !sink.includes(normalized)) { + sink.push(normalized); + } + } + collectTopFiles(nested, sink); + } + return sink; + } + + if (typeof value === 'string') { + const matches = value.match(/[A-Za-z0-9_.-]+(?:\/[A-Za-z0-9_.-]+)+\.[A-Za-z0-9]+/g) ?? 
[]; + for (const match of matches) { + const normalized = normalizeRelativePath(match); + if (normalized && !sink.includes(normalized)) { + sink.push(normalized); + } + } + } + + return sink; +} + +function extractBestExample(value) { + if (!value || typeof value !== 'object') return null; + if (Array.isArray(value)) { + for (const item of value) { + const candidate = extractBestExample(item); + if (candidate) return candidate; + } + return null; + } + + for (const [key, nested] of Object.entries(value)) { + if ( + (key === 'bestExample' || key === 'best_example' || key === 'goldenFile' || key === 'example') && + typeof nested === 'string' + ) { + const normalized = normalizeRelativePath(nested); + if (normalized) return normalized; + } + const candidate = extractBestExample(nested); + if (candidate) return candidate; + } + + return null; +} + +function extractPayloadText(result) { + const parts = []; + if (Array.isArray(result?.content)) { + for (const item of result.content) { + if (typeof item?.text === 'string' && item.text.trim()) { + parts.push(item.text.trim()); + } + } + } + if (result?.structuredContent !== undefined) { + parts.push(JSON.stringify(result.structuredContent, null, 2)); + } + if (parts.length === 0) { + parts.push(JSON.stringify(result)); + } + return parts.join('\n'); +} + +function extractMcpResponse(result) { + const topFiles = collectTopFiles(result?.structuredContent ?? result); + const bestExample = extractBestExample(result?.structuredContent ?? result) ?? topFiles[0] ?? null; + return { + payload: extractPayloadText(result), + ...(topFiles.length > 0 && { topFiles }), + ...(bestExample && { bestExample }) + }; +} + +function parseToolTextPayload(result) { + const textParts = Array.isArray(result?.content) + ? result.content + .map((item) => (typeof item?.text === 'string' ? 
item.text.trim() : '')) + .filter(Boolean) + : []; + return textParts.join('\n'); +} + +function extractIndexedProjectName(listProjectsResult, rootPath) { + const payload = parseToolTextPayload(listProjectsResult); + if (!payload) return null; + + try { + const parsed = JSON.parse(payload); + const projects = Array.isArray(parsed.projects) ? parsed.projects : []; + const normalizedRootPath = normalizeFilesystemPath(rootPath); + const match = projects.find( + (project) => normalizeFilesystemPath(project.root_path) === normalizedRootPath + ); + return typeof match?.name === 'string' ? match.name : null; + } catch { + return null; + } +} + +function matchPatterns(candidates, patterns) { + if (!patterns || patterns.length === 0) return null; + const normalizedPatterns = patterns.map(normalizeText); + for (let index = 0; index < candidates.length; index++) { + const normalizedCandidate = normalizeText(candidates[index]); + if (normalizedPatterns.some((pattern) => normalizedCandidate.includes(pattern))) { + return index + 1; + } + } + return null; +} + +export function buildRawClaudePrompt(task, rootPath) { + const query = task.args?.query ?? task.prompt; + const intent = + task.surface === 'search_codebase' + ? 'search' + : task.surface === 'get_team_patterns' + ? 
'find local conventions' + : 'map/orient to the repository'; + + return [ + `You are exploring a codebase at ${path.resolve(rootPath)}.`, + `Use only Read, Grep, and Glob tools to ${intent}.`, + `Question: ${query}`, + 'Return strict JSON with this shape:', + '{"answer":"short concrete answer with repo terms","files":["repo-relative path in relevance order"],"bestExample":"repo-relative path or null"}', + 'Rules:', + '- files must be repo-relative and ordered most relevant first', + '- answer must include concrete identifiers, files, or patterns from the repo, not generic advice', + '- bestExample must be the strongest local example if one exists, otherwise null', + '- Output JSON only' + ].join('\n'); +} + +export function parseRawClaudeStructuredResult(resultText) { + const topFiles = []; + let bestExample = null; + let payload = resultText; + const trimmed = typeof resultText === 'string' ? resultText.trim() : ''; + const fencedJsonMatch = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i); + const candidateJson = fencedJsonMatch ? fencedJsonMatch[1].trim() : trimmed; + + try { + const parsed = JSON.parse(candidateJson); + if (parsed && typeof parsed === 'object') { + if (Array.isArray(parsed.files)) { + for (const file of parsed.files) { + const normalized = normalizeRelativePath(file); + if (normalized && isLikelyCodePath(normalized) && !topFiles.includes(normalized)) { + topFiles.push(normalized); + } + } + } + const normalizedBestExample = normalizeRelativePath(parsed.bestExample); + if (normalizedBestExample) { + bestExample = normalizedBestExample; + } else if (topFiles.length > 0) { + bestExample = topFiles[0]; + } + payload = JSON.stringify(parsed); + } + } catch { + const fallbackFiles = collectTopFiles(resultText); + for (const file of fallbackFiles) { + if (!topFiles.includes(file)) { + topFiles.push(file); + } + } + bestExample = topFiles[0] ?? 
null; + } + + return { + payload, + ...(topFiles.length > 0 && { topFiles }), + ...(bestExample && { bestExample }) + }; +} + function matchSignals(payload, expectedSignals, forbiddenSignals) { const normalizedPayload = normalizeText(payload); const matchedSignals = expectedSignals.filter((s) => @@ -124,16 +341,26 @@ const COMPARATOR_ADAPTERS = [ serverArgs: ['--yes', 'codebase-memory-mcp'], serverEnv: {}, initTimeout: 10000, + resolveProjectName: true, indexTool: null, // auto-indexes on first query - searchTool: 'search_code', - searchArgs(task) { - return { query: task.prompt, mode: 'compact' }; - }, - extractPayload(result) { - if (Array.isArray(result?.content)) { - return result.content.map((c) => (typeof c?.text === 'string' ? c.text : JSON.stringify(c))).join('\n'); + buildTaskCall(task, { projectName }) { + const query = task.args?.query ?? task.prompt; + if (task.job === 'map') { + return { + name: 'get_architecture', + arguments: { project: projectName } + }; } - return JSON.stringify(result); + + return { + name: 'search_graph', + arguments: { + project: projectName, + query, + include_connected: true, + limit: 10 + } + }; } }, { @@ -170,12 +397,7 @@ const COMPARATOR_ADAPTERS = [ detail_level: 'compact' }; }, - extractPayload(result) { - if (Array.isArray(result?.content)) { - return result.content.map((c) => (typeof c?.text === 'string' ? c.text : JSON.stringify(c))).join('\n'); - } - return JSON.stringify(result); - } + extractPayload: null }, { name: 'GrepAI', @@ -208,12 +430,7 @@ const COMPARATOR_ADAPTERS = [ searchArgs(task) { return { query: task.prompt }; }, - extractPayload(result) { - if (Array.isArray(result?.content)) { - return result.content.map((c) => (typeof c?.text === 'string' ? 
c.text : JSON.stringify(c))).join('\n'); - } - return JSON.stringify(result); - } + extractPayload: null }, { name: 'CodeGraphContext', @@ -249,12 +466,7 @@ const COMPARATOR_ADAPTERS = [ // CodeGraphContext uses cypher-based queries; approximate with a search tool return { query: task.prompt }; }, - extractPayload(result) { - if (Array.isArray(result?.content)) { - return result.content.map((c) => (typeof c?.text === 'string' ? c.text : JSON.stringify(c))).join('\n'); - } - return JSON.stringify(result); - } + extractPayload: null }, { name: 'raw Claude Code', @@ -281,9 +493,7 @@ const COMPARATOR_ADAPTERS = [ searchArgs(task) { return { prompt: task.prompt }; }, - extractPayload(result) { - return typeof result === 'string' ? result : JSON.stringify(result); - } + extractPayload: null } ]; @@ -297,6 +507,7 @@ async function runComparatorViaMcp(adapter, rootPath, tasks) { serverCommand: adapter.serverCommand, serverArgs: adapter.serverArgs, serverEnv: adapter.serverEnv, + cwd: path.resolve(rootPath), connectTimeoutMs: adapter.connectTimeout ?? 
15_000 }, async ({ client }) => { @@ -312,6 +523,25 @@ async function runComparatorViaMcp(adapter, rootPath, tasks) { throw new Error(`Failed to list tools from ${adapter.name}: ${err.message}`); } + let projectName = null; + if (adapter.resolveProjectName && availableTools.some((tool) => tool.name === 'list_projects')) { + try { + const listProjectsResult = await client.callTool({ + name: 'list_projects', + arguments: {} + }); + projectName = extractIndexedProjectName(listProjectsResult, rootPath); + } catch (err) { + throw new Error(`Failed to resolve indexed project for ${adapter.name}: ${err.message}`); + } + + if (!projectName) { + throw new Error( + `Could not resolve indexed project for ${adapter.name} at ${path.resolve(rootPath)}` + ); + } + } + const toolNames = availableTools.map((t) => t.name); let searchToolName = adapter.searchTool; if (!searchToolName) { @@ -348,15 +578,33 @@ async function runComparatorViaMcp(adapter, rootPath, tasks) { for (const task of tasks) { const startMs = Date.now(); let payload = ''; + let topFiles = []; + let bestExample = null; let toolCallCount = totalToolCalls; try { - const result = await client.callTool({ - name: searchToolName, - arguments: adapter.searchArgs(task) - }); + const request = + typeof adapter.buildTaskCall === 'function' + ? adapter.buildTaskCall(task, { rootPath, projectName, toolNames }) + : { + name: searchToolName, + arguments: adapter.searchArgs(task) + }; + const result = await client.callTool(request); toolCallCount++; - payload = adapter.extractPayload(result); + const extracted = + typeof adapter.extractPayload === 'function' + ? adapter.extractPayload(result) + : extractMcpResponse(result); + payload = typeof extracted === 'string' ? extracted : extracted.payload; + topFiles = + extracted && typeof extracted === 'object' && Array.isArray(extracted.topFiles) + ? extracted.topFiles + : []; + bestExample = + extracted && typeof extracted === 'object' && typeof extracted.bestExample === 'string' + ? 
extracted.bestExample + : topFiles[0] ?? null; } catch (err) { console.warn(` [${adapter.name}] Task ${task.id} failed: ${err.message}`); payload = ''; @@ -370,6 +618,13 @@ async function runComparatorViaMcp(adapter, rootPath, tasks) { task.expectedSignals, task.forbiddenSignals ); + const firstRelevantHit = matchPatterns(topFiles, task.expectedFilePatterns); + const bestExampleUseful = + task.expectedBestExamplePatterns && task.expectedBestExamplePatterns.length > 0 + ? task.expectedBestExamplePatterns.some((pattern) => + normalizeText(bestExample ?? '').includes(normalizeText(pattern)) + ) + : undefined; taskResults.push({ taskId: task.id, @@ -381,7 +636,9 @@ async function runComparatorViaMcp(adapter, rootPath, tasks) { payloadBytes, estimatedTokens, toolCallCount, - elapsedMs + elapsedMs, + ...(firstRelevantHit !== null ? { firstRelevantHit } : {}), + ...(typeof bestExampleUseful === 'boolean' ? { bestExampleUseful } : {}) }); } @@ -406,25 +663,76 @@ async function runRawClaudeCode(rootPath, tasks) { for (const task of tasks) { const startMs = Date.now(); let payload = ''; + let topFiles = []; + let bestExample = null; try { - const prompt = `You are exploring a codebase at ${path.resolve(rootPath)}. Answer this question using only grep, glob, and read file operations: ${task.prompt}`; - const { stdout } = await execFileAsync( - 'claude', - ['-p', prompt, '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'], - { timeout: 120000, cwd: path.resolve(rootPath), shell: process.platform === 'win32' } - ); + const prompt = buildRawClaudePrompt(task, rootPath); + const commandArgs = + process.platform === 'win32' + ? 
[ + 'powershell.exe', + [ + '-NoProfile', + '-Command', + 'claude -p $env:CLAUDE_BENCHMARK_PROMPT --model haiku --effort low --output-format json --allowedTools Read,Grep,Glob' + ], + { + timeout: 120000, + cwd: path.resolve(rootPath), + windowsHide: true, + env: { + ...process.env, + CLAUDE_BENCHMARK_PROMPT: prompt + } + } + ] + : [ + 'claude', + ['-p', prompt, '--model', 'haiku', '--effort', 'low', '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'], + { + timeout: 120000, + cwd: path.resolve(rootPath), + windowsHide: true + } + ]; + const { stdout } = await execFileAsync(commandArgs[0], commandArgs[1], commandArgs[2]); try { const parsed = JSON.parse(stdout); - payload = parsed.result ?? stdout; + const extracted = parseRawClaudeStructuredResult(parsed.result ?? stdout); + payload = extracted.payload; + topFiles = extracted.topFiles ?? []; + bestExample = extracted.bestExample ?? null; } catch { - payload = stdout; + const extracted = parseRawClaudeStructuredResult(stdout); + payload = extracted.payload; + topFiles = extracted.topFiles ?? []; + bestExample = extracted.bestExample ?? null; } } catch (err) { if (err.code === 'ENOENT' || err.message?.includes('command not found')) { throw new Error('claude CLI not found'); } - console.warn(` [raw Claude Code] Task ${task.id} error: ${err.message}`); + const fallbackStdout = typeof err.stdout === 'string' ? err.stdout.trim() : ''; + if (fallbackStdout) { + try { + const parsed = JSON.parse(fallbackStdout); + const extracted = parseRawClaudeStructuredResult(parsed.result ?? fallbackStdout); + payload = extracted.payload; + topFiles = extracted.topFiles ?? []; + bestExample = extracted.bestExample ?? null; + } catch { + const extracted = parseRawClaudeStructuredResult(fallbackStdout); + payload = extracted.payload; + topFiles = extracted.topFiles ?? []; + bestExample = extracted.bestExample ?? null; + } + } + + if (!payload) { + const stderr = typeof err.stderr === 'string' ? 
err.stderr.trim() : ''; + console.warn(` [raw Claude Code] Task ${task.id} error: ${stderr || err.message}`); + } } const elapsedMs = Date.now() - startMs; @@ -435,6 +743,13 @@ async function runRawClaudeCode(rootPath, tasks) { task.expectedSignals, task.forbiddenSignals ); + const firstRelevantHit = matchPatterns(topFiles, task.expectedFilePatterns); + const bestExampleUseful = + task.expectedBestExamplePatterns && task.expectedBestExamplePatterns.length > 0 + ? task.expectedBestExamplePatterns.some((pattern) => + normalizeText(bestExample ?? '').includes(normalizeText(pattern)) + ) + : undefined; taskResults.push({ taskId: task.id, @@ -446,7 +761,9 @@ async function runRawClaudeCode(rootPath, tasks) { payloadBytes, estimatedTokens, toolCallCount: null, - elapsedMs + elapsedMs, + ...(firstRelevantHit !== null ? { firstRelevantHit } : {}), + ...(typeof bestExampleUseful === 'boolean' ? { bestExampleUseful } : {}) }); } @@ -457,26 +774,56 @@ async function runRawClaudeCode(rootPath, tasks) { // Aggregate task results into DiscoveryComparatorMetrics shape // --------------------------------------------------------------------------- -function aggregateResults(taskResults) { +export function aggregateResults(taskResults) { const n = taskResults.length; - if (n === 0) return { averageUsefulness: null, averagePayloadBytes: null, averageEstimatedTokens: null, averageFirstRelevantHit: null, bestExampleUsefulnessRate: null }; + if (n === 0) { + return { + averageUsefulness: null, + averagePayloadBytes: null, + averageEstimatedTokens: null, + averageFirstRelevantHit: null, + bestExampleUsefulnessRate: null, + status: 'pending_evidence', + reason: 'No comparator task results were produced' + }; + } const avgUsefulness = taskResults.reduce((s, r) => s + r.usefulnessScore, 0) / n; const avgBytes = taskResults.reduce((s, r) => s + r.payloadBytes, 0) / n; const avgTokens = taskResults.reduce((s, r) => s + r.estimatedTokens, 0) / n; + const searchHits = taskResults + .map((r) => 
r.firstRelevantHit) + .filter((value) => typeof value === 'number'); + const bestExampleResults = taskResults + .map((r) => r.bestExampleUseful) + .filter((value) => typeof value === 'boolean'); const toolCallCounts = taskResults.map((r) => r.toolCallCount).filter((v) => typeof v === 'number'); const elapsedMsList = taskResults.map((r) => r.elapsedMs).filter((v) => typeof v === 'number'); + const hasMeaningfulEvidence = taskResults.some( + (result) => + result.usefulnessScore > 0 || + typeof result.firstRelevantHit === 'number' || + result.bestExampleUseful === true + ); + const status = hasMeaningfulEvidence ? 'ok' : 'pending_evidence'; return { averageUsefulness: avgUsefulness, averagePayloadBytes: avgBytes, averageEstimatedTokens: avgTokens, - averageFirstRelevantHit: null, // comparators don't expose ranked file lists in standard MCP responses - bestExampleUsefulnessRate: null, + averageFirstRelevantHit: + searchHits.length > 0 ? searchHits.reduce((sum, value) => sum + value, 0) / searchHits.length : null, + bestExampleUsefulnessRate: + bestExampleResults.length > 0 + ? bestExampleResults.filter(Boolean).length / bestExampleResults.length + : null, averageToolCallCount: toolCallCounts.length > 0 ? toolCallCounts.reduce((s, v) => s + v, 0) / toolCallCounts.length : null, averageElapsedMs: elapsedMsList.length > 0 ? elapsedMsList.reduce((s, v) => s + v, 0) / elapsedMsList.length : null, - status: 'ok', + status, + ...(status === 'pending_evidence' + ? 
{ reason: 'Comparator returned task payloads, but none contained usable benchmark evidence' } + : {}), taskResults }; } @@ -680,7 +1027,13 @@ async function main() { } } -main().catch((err) => { - console.error('Fatal:', err); - process.exit(2); -}); +const isMain = + process.argv[1] && + import.meta.url === pathToFileURL(path.resolve(process.argv[1])).href; + +if (isMain) { + main().catch((err) => { + console.error('Fatal:', err); + process.exit(2); + }); +} diff --git a/scripts/lib/managed-mcp-session.mjs b/scripts/lib/managed-mcp-session.mjs index 5f54e55..97c8106 100644 --- a/scripts/lib/managed-mcp-session.mjs +++ b/scripts/lib/managed-mcp-session.mjs @@ -2,7 +2,7 @@ import process from 'node:process'; async function loadSdkClient() { const [{ Client }, { StdioClientTransport }] = await Promise.all([ - import('@modelcontextprotocol/sdk/client/index.js'), + import('@modelcontextprotocol/sdk/client'), import('@modelcontextprotocol/sdk/client/stdio.js') ]); diff --git a/scripts/run-eval.mjs b/scripts/run-eval.mjs index ef82204..6f5b706 100644 --- a/scripts/run-eval.mjs +++ b/scripts/run-eval.mjs @@ -11,12 +11,18 @@ import { analyzerRegistry } from '../dist/core/analyzer-registry.js'; import { AngularAnalyzer } from '../dist/analyzers/angular/index.js'; import { GenericAnalyzer } from '../dist/analyzers/generic/index.js'; import { evaluateFixture, formatEvalReport } from '../dist/eval/harness.js'; +import { + combineEditPreflightSummaries, + evaluateEditPreflightFixture, + formatEditPreflightReport +} from '../dist/eval/edit-preflight-harness.js'; import { combineDiscoverySummaries, evaluateDiscoveryGate, evaluateDiscoveryFixture, formatDiscoveryReport } from '../dist/eval/discovery-harness.js'; +import { getDefaultFixturePaths, resolveEvalMode } from '../dist/eval/run-config.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const projectRoot = path.join(__dirname, '..'); @@ -24,20 +30,6 @@ const packageJsonPath = path.join(projectRoot, 
'package.json'); const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8')); -const defaultFixtureA = path.join(projectRoot, 'tests', 'fixtures', 'eval-angular-spotify.json'); -const defaultFixtureB = path.join(projectRoot, 'tests', 'fixtures', 'eval-controlled.json'); -const defaultDiscoveryFixtureA = path.join( - projectRoot, - 'tests', - 'fixtures', - 'discovery-angular-spotify.json' -); -const defaultDiscoveryFixtureB = path.join( - projectRoot, - 'tests', - 'fixtures', - 'discovery-excalidraw.json' -); const defaultDiscoveryProtocol = path.join( projectRoot, 'tests', @@ -49,7 +41,7 @@ const usage = [ `Usage: node scripts/run-eval.mjs [codebaseB] [options]`, ``, `Options:`, - ` --mode= Select benchmark mode (default: retrieval)`, + ` --mode= Select benchmark mode (default: retrieval)`, ` --fixture-a= Override fixture for codebaseA`, ` --fixture-b= Override fixture for codebaseB`, ` --protocol= Override discovery benchmark protocol`, @@ -151,6 +143,17 @@ async function runSingleEvaluation({ fixturePath: resolvedFixture, summary }); + } else if (mode === 'edit-preflight') { + console.log(`\n--- Phase 2: Running ${fixture.tasks.length}-task edit-preflight harness ---`); + summary = await evaluateEditPreflightFixture({ + fixture, + rootPath: resolvedCodebase + }); + report = formatEditPreflightReport({ + codebaseLabel: label, + fixturePath: resolvedFixture, + summary + }); } else { console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`); const searcher = new CodebaseSearcher(resolvedCodebase); @@ -202,6 +205,31 @@ function printCombinedSummary(summaries, mode) { return; } + if (mode === 'edit-preflight') { + const combined = combineEditPreflightSummaries(summaries); + console.log(`\n=== Combined Edit Preflight Summary ===`); + console.log( + `Top-target in top-3: ${combined.topTargetInTop3Count}/${combined.targetableTasks} (${combined.topTargetInTop3Rate === null ? 
'n/a' : (combined.topTargetInTop3Rate * 100).toFixed(0) + '%'})` + ); + console.log( + `Average first relevant hit: ${combined.averageFirstRelevantHit === null ? 'n/a' : combined.averageFirstRelevantHit.toFixed(2)}` + ); + console.log( + `Best-example hit rate: ${combined.bestExampleHitCount}/${combined.bestExampleTasks} (${combined.bestExampleHitRate === null ? 'n/a' : (combined.bestExampleHitRate * 100).toFixed(0) + '%'})` + ); + console.log( + `Safe ready rate: ${combined.safeTaskReadyCount}/${combined.safeTasks} (${combined.safeTaskReadyRate === null ? 'n/a' : (combined.safeTaskReadyRate * 100).toFixed(0) + '%'})` + ); + console.log( + `Unsafe abstain rate: ${combined.unsafeTaskAbstainCount}/${combined.unsafeTasks} (${combined.unsafeTaskAbstainRate === null ? 'n/a' : (combined.unsafeTaskAbstainRate * 100).toFixed(0) + '%'})` + ); + console.log( + `Unsafe ready=true false positives: ${combined.unsafeReadyFalsePositiveCount}/${combined.unsafeTasks} (${combined.unsafeReadyFalsePositiveRate === null ? 'n/a' : (combined.unsafeReadyFalsePositiveRate * 100).toFixed(0) + '%'})` + ); + console.log(`=======================================\n`); + return; + } + const total = summaries.reduce((sum, summary) => sum + summary.total, 0); const top1Correct = summaries.reduce((sum, summary) => sum + summary.top1Correct, 0); const top3RecallCount = summaries.reduce((sum, summary) => sum + summary.top3RecallCount, 0); @@ -254,17 +282,14 @@ async function main() { const codebaseA = positionals[0]; const codebaseB = positionals[1]; - const mode = values.mode === 'discovery' ? 'discovery' : 'retrieval'; + const mode = resolveEvalMode(values.mode); + const defaultFixtures = getDefaultFixturePaths(projectRoot, mode); const fixtureA = values['fixture-a'] ? path.resolve(values['fixture-a']) - : mode === 'discovery' - ? defaultDiscoveryFixtureA - : defaultFixtureA; + : defaultFixtures.fixtureA; const fixtureB = values['fixture-b'] ? 
path.resolve(values['fixture-b']) - : mode === 'discovery' - ? defaultDiscoveryFixtureB - : defaultFixtureB; + : defaultFixtures.fixtureB; const protocolPath = values.protocol ? path.resolve(values.protocol) : defaultDiscoveryProtocol; @@ -326,6 +351,25 @@ async function main() { process.exit(gate.status === 'failed' ? 1 : 0); } + if (mode === 'edit-preflight') { + const combinedSummary = combineEditPreflightSummaries(summaries); + printCombinedSummary(summaries, mode); + console.log( + formatEditPreflightReport({ + codebaseLabel: 'combined-suite', + fixturePath: codebaseB ? `${fixtureA}, ${fixtureB}` : fixtureA, + summary: combinedSummary + }) + ); + if (outputPath) { + const outputDir = path.dirname(outputPath); + if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true }); + writeFileSync(outputPath, JSON.stringify(combinedSummary, null, 2)); + console.log(`\nResults written to: ${outputPath}`); + } + process.exit(0); + } + if (outputPath && mode === 'discovery' && summaries.length === 1) { const outputDir = path.dirname(outputPath); if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true }); diff --git a/src/eval/edit-preflight-harness.ts b/src/eval/edit-preflight-harness.ts new file mode 100644 index 0000000..4f34588 --- /dev/null +++ b/src/eval/edit-preflight-harness.ts @@ -0,0 +1,266 @@ +import { createProjectState } from '../project-state.js'; +import { handle as searchCodebaseHandle } from '../tools/search-codebase.js'; +import type { + EditPreflightFixture, + EditPreflightResponse, + EditPreflightRunner, + EditPreflightSummary, + EditPreflightTask, + EditPreflightTaskResult, + EvaluateEditPreflightFixtureParams, + FormatEditPreflightReportParams +} from './types.js'; + +function normalizeText(value: string): string { + return value.toLowerCase().replace(/\\/g, '/'); +} + +function stripLocationSuffix(fileRef: string): string { + return fileRef.replace(/:(\d+)(?:-\d+)?$/, ''); +} + +function matchesPatterns(candidate: string, patterns: 
string[] | undefined): boolean { + if (!patterns || patterns.length === 0) { + return false; + } + + const normalizedCandidate = normalizeText(candidate); + return patterns.some((pattern) => normalizedCandidate.includes(normalizeText(pattern))); +} + +function findFirstRelevantHit(topFiles: string[], patterns: string[] | undefined): number | null { + if (!patterns || patterns.length === 0) { + return null; + } + + for (let index = 0; index < topFiles.length; index++) { + if (matchesPatterns(topFiles[index], patterns)) { + return index + 1; + } + } + + return null; +} + +function summarizeEditPreflightResults(results: EditPreflightTaskResult[]): EditPreflightSummary { + const totalTasks = results.length; + const safeResults = results.filter((result) => result.risk === 'safe'); + const unsafeResults = results.filter((result) => result.risk === 'unsafe'); + const targetableResults = results.filter((result) => result.topTargetInTop3 !== null); + const bestExampleResults = results.filter((result) => result.bestExampleHit !== null); + const firstRelevantHits = results + .map((result) => result.firstRelevantHit) + .filter((value): value is number => typeof value === 'number'); + + const topTargetInTop3Count = targetableResults.filter((result) => result.topTargetInTop3).length; + const bestExampleHitCount = bestExampleResults.filter((result) => result.bestExampleHit).length; + const safeTaskReadyCount = safeResults.filter((result) => result.ready).length; + const unsafeTaskAbstainCount = unsafeResults.filter((result) => result.abstain).length; + const unsafeReadyFalsePositiveCount = unsafeResults.filter((result) => result.ready).length; + + return { + totalTasks, + safeTasks: safeResults.length, + unsafeTasks: unsafeResults.length, + targetableTasks: targetableResults.length, + bestExampleTasks: bestExampleResults.length, + topTargetInTop3Count, + topTargetInTop3Rate: + targetableResults.length > 0 ? 
topTargetInTop3Count / targetableResults.length : null, + averageFirstRelevantHit: + firstRelevantHits.length > 0 + ? firstRelevantHits.reduce((sum, value) => sum + value, 0) / firstRelevantHits.length + : null, + bestExampleHitCount, + bestExampleHitRate: + bestExampleResults.length > 0 ? bestExampleHitCount / bestExampleResults.length : null, + safeTaskReadyCount, + safeTaskReadyRate: safeResults.length > 0 ? safeTaskReadyCount / safeResults.length : null, + unsafeTaskAbstainCount, + unsafeTaskAbstainRate: + unsafeResults.length > 0 ? unsafeTaskAbstainCount / unsafeResults.length : null, + unsafeReadyFalsePositiveCount, + unsafeReadyFalsePositiveRate: + unsafeResults.length > 0 ? unsafeReadyFalsePositiveCount / unsafeResults.length : null, + results + }; +} + +function evaluateTask(task: EditPreflightTask, response: EditPreflightResponse): EditPreflightTaskResult { + const topFiles = (response.results ?? []) + .map((result) => (typeof result.file === 'string' ? stripLocationSuffix(result.file) : '')) + .filter((filePath): filePath is string => Boolean(filePath)); + const firstRelevantHit = findFirstRelevantHit(topFiles, task.expectedTargetPatterns); + const bestExample = + typeof response.preflight?.bestExample === 'string' ? response.preflight.bestExample : null; + const bestExampleHit = + task.expectedBestExamplePatterns && task.expectedBestExamplePatterns.length > 0 + ? bestExample !== null && matchesPatterns(bestExample, task.expectedBestExamplePatterns) + : null; + + return { + taskId: task.id, + title: task.title, + query: task.query, + risk: task.risk, + ready: response.preflight?.ready === true, + abstain: response.preflight?.abstain === true, + searchQualityStatus: response.searchQuality?.status ?? 'unknown', + topFiles, + firstRelevantHit, + topTargetInTop3: + task.expectedTargetPatterns && task.expectedTargetPatterns.length > 0 + ? 
firstRelevantHit !== null && firstRelevantHit <= 3 + : null, + bestExample, + bestExampleHit, + ...(typeof response.preflight?.nextAction === 'string' && { + nextAction: response.preflight.nextAction + }), + ...(Array.isArray(response.preflight?.warnings) && + response.preflight.warnings.length > 0 && { warnings: response.preflight.warnings }), + ...(Array.isArray(response.preflight?.whatWouldHelp) && + response.preflight.whatWouldHelp.length > 0 && { + whatWouldHelp: response.preflight.whatWouldHelp + }) + }; +} + +async function runSearchPreflight( + task: EditPreflightTask, + rootPath: string +): Promise { + const project = createProjectState(rootPath); + project.indexState.status = 'ready'; + + const response = await searchCodebaseHandle( + { + query: task.query, + intent: 'edit', + limit: task.limit ?? 5 + }, + { + indexState: project.indexState, + paths: project.paths, + rootPath: project.rootPath, + performIndexing: () => undefined + } + ); + const payload = response.content?.[0]?.text ?? 
/**
 * Renders a 0..1 ratio as a whole-number percentage for the report.
 *
 * @param value - The ratio, or `null` when the rate has no denominator.
 * @returns `'n/a'` for `null`, otherwise the value times 100 with no decimals, followed by `%`.
 */
function formatRate(value: number | null): string {
  return value === null ? 'n/a' : `${(value * 100).toFixed(0)}%`;
}
'n/a' : value.toFixed(2); +} + +export function formatEditPreflightReport({ + codebaseLabel, + fixturePath, + summary +}: FormatEditPreflightReportParams): string { + const lines: string[] = []; + const unsafeFalsePositives = summary.results.filter( + (result) => result.risk === 'unsafe' && result.ready + ); + const safeMisses = summary.results.filter((result) => result.risk === 'safe' && !result.ready); + + lines.push(`\n=== Edit Preflight Eval Report: ${codebaseLabel} ===`); + lines.push(`Fixture: ${fixturePath}`); + lines.push(`Tasks: ${summary.totalTasks} (${summary.safeTasks} safe, ${summary.unsafeTasks} unsafe)`); + lines.push( + `Top-target in top-3: ${summary.topTargetInTop3Count}/${summary.targetableTasks} (${formatRate(summary.topTargetInTop3Rate)})` + ); + lines.push(`Average first relevant hit: ${formatHit(summary.averageFirstRelevantHit)}`); + lines.push( + `Best-example hit rate: ${summary.bestExampleHitCount}/${summary.bestExampleTasks} (${formatRate(summary.bestExampleHitRate)})` + ); + lines.push( + `Safe-task ready rate: ${summary.safeTaskReadyCount}/${summary.safeTasks} (${formatRate(summary.safeTaskReadyRate)})` + ); + lines.push( + `Unsafe-task abstain rate: ${summary.unsafeTaskAbstainCount}/${summary.unsafeTasks} (${formatRate(summary.unsafeTaskAbstainRate)})` + ); + lines.push( + `Unsafe ready=true false-positive rate: ${summary.unsafeReadyFalsePositiveCount}/${summary.unsafeTasks} (${formatRate(summary.unsafeReadyFalsePositiveRate)})` + ); + lines.push(''); + lines.push('Task results:'); + + for (const result of summary.results) { + const taskLine = [ + `- ${result.taskId}`, + `[${result.risk}]`, + `ready=${result.ready ? 'yes' : 'no'}`, + `abstain=${result.abstain ? 'yes' : 'no'}`, + `firstRelevant=${result.firstRelevantHit ?? 'n/a'}`, + `top3=${result.topTargetInTop3 === null ? 'n/a' : result.topTargetInTop3 ? 'hit' : 'miss'}`, + `bestExample=${result.bestExampleHit === null ? 'n/a' : result.bestExampleHit ? 
/**
 * Resolves the two default fixture files used by an eval mode.
 *
 * @param projectRoot - Repository root that contains `tests/fixtures`.
 * @param mode - Eval mode; anything other than `discovery` or `edit-preflight`
 *   falls through to the retrieval fixtures.
 * @returns Paths for fixture A and fixture B under `<projectRoot>/tests/fixtures`.
 */
export function getDefaultFixturePaths(projectRoot: string, mode: EvalMode): EvalFixtureDefaults {
  const inFixturesDir = (fileName: string): string =>
    path.join(projectRoot, 'tests', 'fixtures', fileName);

  let firstName: string;
  let secondName: string;

  switch (mode) {
    case 'discovery':
      firstName = 'discovery-angular-spotify.json';
      secondName = 'discovery-excalidraw.json';
      break;
    case 'edit-preflight':
      firstName = 'edit-preflight-angular-spotify.json';
      secondName = 'edit-preflight-excalidraw.json';
      break;
    default:
      // Retrieval is the fallback lane.
      firstName = 'eval-angular-spotify.json';
      secondName = 'eval-controlled.json';
      break;
  }

  return {
    fixtureA: inFixturesDir(firstName),
    fixtureB: inFixturesDir(secondName)
  };
}
/**
 * Async function that produces the preflight response for one task.
 *
 * @param task - The fixture task to run; its `query` (and optional `limit`) drive the search.
 * @param rootPath - Root of the codebase the task is evaluated against.
 * @returns A promise of the parsed response. All response fields are optional, so an
 *   empty object is a valid result.
 */
export type EditPreflightRunner = (
  task: EditPreflightTask,
  rootPath: string
) => Promise<EditPreflightResponse>;
expect(aggregated.averageFirstRelevantHit).toBeNull(); + expect(aggregated.bestExampleUsefulnessRate).toBeNull(); + }); + + it('computes ranked-hit and best-example metrics when task evidence exists', async () => { + const { aggregateResults } = await importRunner(); + const aggregated = aggregateResults([ + { + taskId: 'search-1', + job: 'search', + surface: 'search_codebase', + usefulnessScore: 0.5, + matchedSignals: ['results'], + missingSignals: ['searchQuality'], + payloadBytes: 200, + estimatedTokens: 50, + toolCallCount: 1, + elapsedMs: 10, + firstRelevantHit: 2 + }, + { + taskId: 'find-1', + job: 'find', + surface: 'search_codebase', + usefulnessScore: 1, + matchedSignals: ['bestExample'], + missingSignals: [], + payloadBytes: 220, + estimatedTokens: 55, + toolCallCount: 1, + elapsedMs: 12, + bestExampleUseful: true + } + ]); + + expect(aggregated.status).toBe('ok'); + expect(aggregated.averageFirstRelevantHit).toBe(2); + expect(aggregated.bestExampleUsefulnessRate).toBe(1); + }); +}); + +describe('raw Claude result parsing', () => { + it('extracts files and bestExample from structured Claude output', async () => { + const { parseRawClaudeStructuredResult } = await importRunner(); + const parsed = parseRawClaudeStructuredResult( + JSON.stringify({ + answer: 'Use AuthInterceptor and auth.effects patterns.', + files: ['src/auth/auth.interceptor.ts', 'src/auth/auth.effects.ts'], + bestExample: 'src/auth/auth.interceptor.ts' + }) + ); + + expect(parsed.payload).toContain('AuthInterceptor'); + expect(parsed.topFiles).toEqual([ + 'src/auth/auth.interceptor.ts', + 'src/auth/auth.effects.ts' + ]); + expect(parsed.bestExample).toBe('src/auth/auth.interceptor.ts'); + }); + + it('extracts files and bestExample from fenced JSON Claude output', async () => { + const { parseRawClaudeStructuredResult } = await importRunner(); + const parsed = parseRawClaudeStructuredResult(`\`\`\`json +{"answer":"Use AuthInterceptor and auth.effects 
patterns.","files":["src/auth/auth.interceptor.ts","src/auth/auth.effects.ts"],"bestExample":"src/auth/auth.interceptor.ts"} +\`\`\``); + + expect(parsed.payload).toContain('AuthInterceptor'); + expect(parsed.topFiles).toEqual([ + 'src/auth/auth.interceptor.ts', + 'src/auth/auth.effects.ts' + ]); + expect(parsed.bestExample).toBe('src/auth/auth.interceptor.ts'); + }); +}); diff --git a/tests/edit-preflight-harness.test.ts b/tests/edit-preflight-harness.test.ts new file mode 100644 index 0000000..6332d15 --- /dev/null +++ b/tests/edit-preflight-harness.test.ts @@ -0,0 +1,243 @@ +import { describe, expect, it } from 'vitest'; +import { + combineEditPreflightSummaries, + evaluateEditPreflightFixture, + formatEditPreflightReport +} from '../src/eval/edit-preflight-harness.js'; +import type { + EditPreflightFixture, + EditPreflightResponse, + EditPreflightSummary +} from '../src/eval/types.js'; +import angularEditPreflightFixture from './fixtures/edit-preflight-angular-spotify.json'; +import excalidrawEditPreflightFixture from './fixtures/edit-preflight-excalidraw.json'; + +describe('Edit preflight fixtures', () => { + it('keeps both public edit-preflight fixtures frozen at 10 tasks each with safe/unsafe balance', () => { + for (const fixture of [angularEditPreflightFixture, excalidrawEditPreflightFixture]) { + expect(fixture.tasks).toHaveLength(10); + const counts = fixture.tasks.reduce>((acc, task) => { + acc[task.risk] = (acc[task.risk] ?? 
0) + 1; + return acc; + }, {}); + expect(counts.safe).toBe(6); + expect(counts.unsafe).toBe(4); + } + }); + + it('pins both edit-preflight fixtures to concrete repository refs', () => { + expect(angularEditPreflightFixture.repositoryRef).toMatch(/^[0-9a-f]{40}$/); + expect(excalidrawEditPreflightFixture.repositoryRef).toMatch(/^[0-9a-f]{40}$/); + }); +}); + +describe('Edit preflight harness scoring', () => { + it('scores target hits, best-example hits, safe ready rate, and unsafe abstention deterministically', async () => { + const fixture: EditPreflightFixture = { + tasks: [ + { + id: 'safe-1', + title: 'Safe auth edit', + query: 'edit auth headers', + risk: 'safe', + expectedTargetPatterns: ['auth.interceptor.ts'], + expectedBestExamplePatterns: ['auth.interceptor.ts'] + }, + { + id: 'safe-2', + title: 'Safe player edit', + query: 'edit player flow', + risk: 'safe', + expectedTargetPatterns: ['player-api.ts'], + expectedBestExamplePatterns: ['player-api.ts'] + }, + { + id: 'unsafe-1', + title: 'Unsafe migration', + query: 'rewrite everything', + risk: 'unsafe' + } + ] + }; + + const responses: Record = { + 'edit auth headers': { + preflight: { + ready: true, + bestExample: 'src/http/auth.interceptor.ts' + }, + searchQuality: { status: 'ok' }, + results: [ + { file: 'src/http/auth.interceptor.ts:1-20' }, + { file: 'src/http/error.interceptor.ts:1-20' } + ] + }, + 'edit player flow': { + preflight: { + ready: false, + bestExample: 'src/player/player-api.ts', + nextAction: 'Search for callers before editing.' + }, + searchQuality: { status: 'ok' }, + results: [ + { file: 'src/player/player-helper.ts:1-20' }, + { file: 'src/player/player-api.ts:1-20' } + ] + }, + 'rewrite everything': { + preflight: { + ready: false, + abstain: true, + nextAction: 'Break the request into smaller edits.' 
+ }, + searchQuality: { status: 'low_confidence' }, + results: [{ file: 'src/app/app.ts:1-20' }] + } + }; + + const summary = await evaluateEditPreflightFixture({ + fixture, + rootPath: 'C:/repo', + runner: async (task) => responses[task.query] ?? {} + }); + + expect(summary.totalTasks).toBe(3); + expect(summary.topTargetInTop3Count).toBe(2); + expect(summary.topTargetInTop3Rate).toBe(1); + expect(summary.averageFirstRelevantHit).toBe(1.5); + expect(summary.bestExampleHitRate).toBe(1); + expect(summary.safeTaskReadyRate).toBe(0.5); + expect(summary.unsafeTaskAbstainRate).toBe(1); + expect(summary.unsafeReadyFalsePositiveRate).toBe(0); + }); + + it('combines summaries by recomputing aggregate rates from task results', () => { + const combined = combineEditPreflightSummaries([ + createSummary({ + results: [ + { + taskId: 'safe-1', + title: 'safe-1', + query: 'safe-1', + risk: 'safe', + ready: true, + abstain: false, + searchQualityStatus: 'ok', + topFiles: ['src/auth.ts'], + firstRelevantHit: 1, + topTargetInTop3: true, + bestExample: 'src/auth.ts', + bestExampleHit: true + } + ] + }), + createSummary({ + results: [ + { + taskId: 'unsafe-1', + title: 'unsafe-1', + query: 'unsafe-1', + risk: 'unsafe', + ready: false, + abstain: true, + searchQualityStatus: 'low_confidence', + topFiles: ['src/app.ts'], + firstRelevantHit: null, + topTargetInTop3: null, + bestExample: null, + bestExampleHit: null + } + ] + }) + ]); + + expect(combined.totalTasks).toBe(2); + expect(combined.safeTaskReadyRate).toBe(1); + expect(combined.unsafeTaskAbstainRate).toBe(1); + expect(combined.unsafeReadyFalsePositiveRate).toBe(0); + }); + + it('formats a bounded edit-preflight report with false-positive and safe-miss sections', () => { + const report = formatEditPreflightReport({ + codebaseLabel: 'fixture-repo', + fixturePath: 'tests/fixtures/edit-preflight-angular-spotify.json', + summary: createSummary({ + results: [ + { + taskId: 'safe-1', + title: 'safe-1', + query: 'safe query', + risk: 
'safe', + ready: false, + abstain: false, + searchQualityStatus: 'ok', + topFiles: ['src/auth.ts'], + firstRelevantHit: 2, + topTargetInTop3: true, + bestExample: 'src/auth.ts', + bestExampleHit: true, + nextAction: 'Search for callers first.' + }, + { + taskId: 'unsafe-1', + title: 'unsafe-1', + query: 'unsafe query', + risk: 'unsafe', + ready: true, + abstain: false, + searchQualityStatus: 'ok', + topFiles: ['src/app.ts'], + firstRelevantHit: null, + topTargetInTop3: null, + bestExample: null, + bestExampleHit: null + } + ], + totalTasks: 2, + safeTasks: 1, + unsafeTasks: 1, + targetableTasks: 1, + bestExampleTasks: 1, + topTargetInTop3Count: 1, + topTargetInTop3Rate: 1, + averageFirstRelevantHit: 2, + bestExampleHitCount: 1, + bestExampleHitRate: 1, + safeTaskReadyCount: 0, + safeTaskReadyRate: 0, + unsafeTaskAbstainCount: 0, + unsafeTaskAbstainRate: 0, + unsafeReadyFalsePositiveCount: 1, + unsafeReadyFalsePositiveRate: 1 + }) + }); + + expect(report).toContain('Edit Preflight Eval Report'); + expect(report).toContain('Unsafe false positives:'); + expect(report).toContain('Safe misses:'); + expect(report).toContain('next: Search for callers first.'); + }); +}); + +function createSummary(overrides: Partial = {}): EditPreflightSummary { + return { + totalTasks: 0, + safeTasks: 0, + unsafeTasks: 0, + targetableTasks: 0, + bestExampleTasks: 0, + topTargetInTop3Count: 0, + topTargetInTop3Rate: null, + averageFirstRelevantHit: null, + bestExampleHitCount: 0, + bestExampleHitRate: null, + safeTaskReadyCount: 0, + safeTaskReadyRate: null, + unsafeTaskAbstainCount: 0, + unsafeTaskAbstainRate: null, + unsafeReadyFalsePositiveCount: 0, + unsafeReadyFalsePositiveRate: null, + results: [], + ...overrides + }; +} diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md index 18d954c..073c71a 100644 --- a/tests/fixtures/README.md +++ b/tests/fixtures/README.md @@ -1,6 +1,6 @@ # Evaluation Fixtures -This directory contains frozen evaluation sets for testing retrieval 
and discovery quality. +This directory contains frozen evaluation sets for testing retrieval, discovery, and edit-preflight quality. ## Files @@ -8,6 +8,8 @@ This directory contains frozen evaluation sets for testing retrieval and discove - `eval-controlled.json` - 20 frozen retrieval queries for the in-repo controlled fixture codebase - `discovery-angular-spotify.json` - 12 discovery tasks for `angular-spotify` - `discovery-excalidraw.json` - 12 discovery tasks for `Excalidraw` +- `edit-preflight-angular-spotify.json` - 10 edit-readiness tasks for `angular-spotify` +- `edit-preflight-excalidraw.json` - 10 edit-readiness tasks for `Excalidraw` - `discovery-benchmark-protocol.json` - frozen scope, comparator set, fairness rules, and ship gate for the discovery benchmark ## Running Evaluations @@ -42,6 +44,12 @@ node scripts/run-eval.mjs tests/fixtures/codebases/eval-controlled --mode retrie node scripts/run-eval.mjs /path/to/angular-spotify /path/to/excalidraw --mode discovery ``` +### Run Edit-Preflight Evaluation + +```bash +node scripts/run-eval.mjs /path/to/angular-spotify /path/to/excalidraw --mode edit-preflight +``` + Optional comparator evidence file: ```bash @@ -66,6 +74,15 @@ The discovery harness outputs: - **Average first relevant hit**: position of the first relevant file for search tasks - **Best-example usefulness**: whether find tasks surfaced the expected exemplar +The edit-preflight harness outputs: + +- **Top-target in top-3**: whether the expected edit surface appears within the first three results +- **Average first relevant hit**: average ranking position of the first expected edit surface +- **Best-example hit rate**: whether preflight `bestExample` matches the expected local exemplar +- **Safe-task ready rate**: how often concrete local edits return `ready=true` +- **Unsafe-task abstain rate**: how often broad or migration-scale asks return `abstain=true` +- **Unsafe `ready=true` false-positive rate**: how often unsafe asks are incorrectly 
marked ready + ## Evaluation Integrity Rules ⚠️ **CRITICAL**: These fixtures are FROZEN. Once committed: @@ -81,6 +98,11 @@ For discovery specifically: 6. **DO NOT** claim implementation quality from this benchmark 7. **DO** keep comparator setup limitations explicit when a lane requires manual log capture +For edit-preflight specifically: + +8. **DO NOT** convert these tasks into patch-quality or autonomous-edit claims +9. **DO** treat unsafe-task false positives as the critical failure signal + ### Proper Usage ✅ **CORRECT**: @@ -167,6 +189,14 @@ git -C /path/to/excalidraw checkout e18c1dd213000dde0ae94ef7eb00aab537b39708 3. Run eval on both pinned repos 4. Compare metrics transparently +### Edit-Preflight Scope + +Edit-preflight mode is intentionally non-comparator and launch-readiness oriented: + +1. It only evaluates the shipped `search_codebase` edit preflight +2. It measures navigation/readiness signals, not code generation quality +3. It keeps safe and unsafe tasks explicit so false positives are visible + ## Discovery Benchmark Scope Phase 5 freezes discovery around three jobs only: diff --git a/tests/fixtures/edit-preflight-angular-spotify.json b/tests/fixtures/edit-preflight-angular-spotify.json new file mode 100644 index 0000000..8004e22 --- /dev/null +++ b/tests/fixtures/edit-preflight-angular-spotify.json @@ -0,0 +1,93 @@ +{ + "description": "Frozen edit-preflight tasks for angular-spotify. This suite measures readiness and abstention behavior, not autonomous edit quality.", + "codebase": "angular-spotify", + "repository": "trungk18/angular-spotify", + "repositoryUrl": "https://github.com/trungk18/angular-spotify", + "repositoryRef": "ff9efa765c53cfde78c9a172c62d515ae8ef9fe0", + "frozenDate": "2026-04-17", + "notes": "Safe tasks are concrete local edits. 
Unsafe tasks are intentionally broad or high-impact and should not be used to justify ready=true claims without stronger evidence.", + "tasks": [ + { + "id": "as-ep-01", + "title": "Tight auth header edit", + "query": "update how authorization token headers are attached to API requests", + "risk": "safe", + "expectedTargetPatterns": ["auth", "interceptor"], + "expectedBestExamplePatterns": ["auth", "interceptor"], + "notes": "A local interceptor change should be navigable with an edit-ready preflight." + }, + { + "id": "as-ep-02", + "title": "Playback next-track behavior", + "query": "change the logic that skips to the next song", + "risk": "safe", + "expectedTargetPatterns": ["player-api", "player/api"], + "expectedBestExamplePatterns": ["player"], + "notes": "Targets the playback API surface used in retrieval/discovery fixtures." + }, + { + "id": "as-ep-03", + "title": "Recently played fetch flow", + "query": "edit how recently played tracks are fetched", + "risk": "safe", + "expectedTargetPatterns": ["player", "recent"], + "expectedBestExamplePatterns": ["player"], + "notes": "Concrete API behavior with a narrow blast radius." + }, + { + "id": "as-ep-04", + "title": "Album store loading flow", + "query": "modify how saved albums are loaded in application state", + "risk": "safe", + "expectedTargetPatterns": ["album", "store"], + "expectedBestExamplePatterns": ["album", "store"], + "notes": "A focused state-management edit." + }, + { + "id": "as-ep-05", + "title": "Local persistence helper", + "query": "change how browser session data is persisted locally", + "risk": "safe", + "expectedTargetPatterns": ["storage", "local-storage"], + "expectedBestExamplePatterns": ["storage", "local-storage"], + "notes": "Targets the local storage service." 
+ }, + { + "id": "as-ep-06", + "title": "Playlist API request helper", + "query": "update playlist API request handling", + "risk": "safe", + "expectedTargetPatterns": ["playlist", "api"], + "expectedBestExamplePatterns": ["playlist", "api"], + "notes": "Exact-name adjacent edit on a concrete service." + }, + { + "id": "as-ep-07", + "title": "Rewrite auth behavior everywhere", + "query": "rewrite every auth-related request and token flow across the app in one change", + "risk": "unsafe", + "notes": "Broad, cross-cutting request that should not be treated as a ready local edit." + }, + { + "id": "as-ep-08", + "title": "Migrate all NgRx state at once", + "query": "replace all ngrx state management with a new pattern across the whole app", + "risk": "unsafe", + "notes": "Migration-scale ask with intentionally high impact." + }, + { + "id": "as-ep-09", + "title": "Refactor every interceptor path", + "query": "refactor all interceptors and token refresh behavior throughout the repository", + "risk": "unsafe", + "notes": "Multiple coupled subsystems, not a single safe edit target." + }, + { + "id": "as-ep-10", + "title": "Remove analytics globally", + "query": "remove every analytics and tracking hook from the entire app", + "risk": "unsafe", + "notes": "Repository-wide removal request intended to test abstention." + } + ] +} diff --git a/tests/fixtures/edit-preflight-excalidraw.json b/tests/fixtures/edit-preflight-excalidraw.json new file mode 100644 index 0000000..1a45a27 --- /dev/null +++ b/tests/fixtures/edit-preflight-excalidraw.json @@ -0,0 +1,93 @@ +{ + "description": "Frozen edit-preflight tasks for Excalidraw. 
This suite measures whether the current preflight finds the right edit surface and abstains on unsafe asks.", + "codebase": "Excalidraw", + "repository": "excalidraw/excalidraw", + "repositoryUrl": "https://github.com/excalidraw/excalidraw", + "repositoryRef": "e18c1dd213000dde0ae94ef7eb00aab537b39708", + "frozenDate": "2026-04-17", + "notes": "Safe tasks stay local to a scene, element, serialization, or app-state surface. Unsafe tasks intentionally span multiple subsystems or migration-scale edits.", + "tasks": [ + { + "id": "ex-ep-01", + "title": "Scene update flow", + "query": "change how scene updates are applied", + "risk": "safe", + "expectedTargetPatterns": ["scene"], + "expectedBestExamplePatterns": ["scene"], + "notes": "Focused scene-edit behavior used in current discovery coverage." + }, + { + "id": "ex-ep-02", + "title": "Element type definitions", + "query": "edit element type definitions", + "risk": "safe", + "expectedTargetPatterns": ["element", "type"], + "expectedBestExamplePatterns": ["element", "type"], + "notes": "Concrete type-oriented edit surface." + }, + { + "id": "ex-ep-03", + "title": "Scene JSON serialization", + "query": "modify scene serialization to json export", + "risk": "safe", + "expectedTargetPatterns": ["scene", "json", "data"], + "expectedBestExamplePatterns": ["scene", "json", "data"], + "notes": "Narrow export/serialization edit." + }, + { + "id": "ex-ep-04", + "title": "App state selection flow", + "query": "change app state selection and update logic", + "risk": "safe", + "expectedTargetPatterns": ["appstate", "state", "app"], + "expectedBestExamplePatterns": ["appstate", "state", "app"], + "notes": "Local app-state behavior." 
+ }, + { + "id": "ex-ep-05", + "title": "Canvas entry interaction", + "query": "edit the main canvas app entry behavior", + "risk": "safe", + "expectedTargetPatterns": ["app", "excalidraw", "canvas"], + "expectedBestExamplePatterns": ["app", "excalidraw", "canvas"], + "notes": "Concrete entry-surface edit." + }, + { + "id": "ex-ep-06", + "title": "Element mutation helper", + "query": "change how elements are updated after scene edits", + "risk": "safe", + "expectedTargetPatterns": ["element", "scene"], + "expectedBestExamplePatterns": ["element", "scene"], + "notes": "Targets the local element mutation path without asking for repo-wide migration." + }, + { + "id": "ex-ep-07", + "title": "Rewrite scene mutation architecture", + "query": "rewrite all scene mutation flows across the whole app in one pass", + "risk": "unsafe", + "notes": "Broad architectural request intended to trigger abstention." + }, + { + "id": "ex-ep-08", + "title": "Replace state model globally", + "query": "migrate every app state update path to a new state architecture", + "risk": "unsafe", + "notes": "Migration-scale change across the repository." + }, + { + "id": "ex-ep-09", + "title": "Refactor export and collaboration together", + "query": "change the entire export pipeline and collaboration serialization at once", + "risk": "unsafe", + "notes": "Coupled multi-subsystem change that should not look edit-ready from one search." + }, + { + "id": "ex-ep-10", + "title": "Rename all element concepts", + "query": "rename every element type and related references across the repo", + "risk": "unsafe", + "notes": "Repository-wide rename intended to test unsafe ready=true false positives." 
+ } + ] +} diff --git a/tests/run-eval-config.test.ts b/tests/run-eval-config.test.ts new file mode 100644 index 0000000..3667473 --- /dev/null +++ b/tests/run-eval-config.test.ts @@ -0,0 +1,25 @@ +import path from 'path'; +import { describe, expect, it } from 'vitest'; +import { getDefaultFixturePaths, resolveEvalMode } from '../src/eval/run-config.js'; + +describe('run-eval mode config', () => { + it('recognizes edit-preflight as a first-class eval mode', () => { + expect(resolveEvalMode('edit-preflight')).toBe('edit-preflight'); + expect(resolveEvalMode('discovery')).toBe('discovery'); + expect(resolveEvalMode('retrieval')).toBe('retrieval'); + }); + + it('keeps retrieval as the fallback mode for unknown values', () => { + expect(resolveEvalMode('unknown-mode')).toBe('retrieval'); + expect(resolveEvalMode(undefined)).toBe('retrieval'); + }); + + it('returns dedicated frozen default fixtures for edit-preflight mode', () => { + const defaults = getDefaultFixturePaths('C:/repo', 'edit-preflight'); + + expect(defaults).toEqual({ + fixtureA: path.join('C:/repo', 'tests', 'fixtures', 'edit-preflight-angular-spotify.json'), + fixtureB: path.join('C:/repo', 'tests', 'fixtures', 'edit-preflight-excalidraw.json') + }); + }); +});