Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
465 changes: 409 additions & 56 deletions scripts/benchmark-comparators.mjs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion scripts/lib/managed-mcp-session.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import process from 'node:process';

async function loadSdkClient() {
const [{ Client }, { StdioClientTransport }] = await Promise.all([
import('@modelcontextprotocol/sdk/client/index.js'),
import('@modelcontextprotocol/sdk/client'),
import('@modelcontextprotocol/sdk/client/stdio.js')
]);

Expand Down
88 changes: 66 additions & 22 deletions scripts/run-eval.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,25 @@ import { analyzerRegistry } from '../dist/core/analyzer-registry.js';
import { AngularAnalyzer } from '../dist/analyzers/angular/index.js';
import { GenericAnalyzer } from '../dist/analyzers/generic/index.js';
import { evaluateFixture, formatEvalReport } from '../dist/eval/harness.js';
import {
combineEditPreflightSummaries,
evaluateEditPreflightFixture,
formatEditPreflightReport
} from '../dist/eval/edit-preflight-harness.js';
import {
combineDiscoverySummaries,
evaluateDiscoveryGate,
evaluateDiscoveryFixture,
formatDiscoveryReport
} from '../dist/eval/discovery-harness.js';
import { getDefaultFixturePaths, resolveEvalMode } from '../dist/eval/run-config.js';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const projectRoot = path.join(__dirname, '..');
const packageJsonPath = path.join(projectRoot, 'package.json');

const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));

const defaultFixtureA = path.join(projectRoot, 'tests', 'fixtures', 'eval-angular-spotify.json');
const defaultFixtureB = path.join(projectRoot, 'tests', 'fixtures', 'eval-controlled.json');
const defaultDiscoveryFixtureA = path.join(
projectRoot,
'tests',
'fixtures',
'discovery-angular-spotify.json'
);
const defaultDiscoveryFixtureB = path.join(
projectRoot,
'tests',
'fixtures',
'discovery-excalidraw.json'
);
const defaultDiscoveryProtocol = path.join(
projectRoot,
'tests',
Expand All @@ -49,7 +41,7 @@ const usage = [
`Usage: node scripts/run-eval.mjs <codebaseA> [codebaseB] [options]`,
``,
`Options:`,
` --mode=<retrieval|discovery> Select benchmark mode (default: retrieval)`,
` --mode=<retrieval|discovery|edit-preflight> Select benchmark mode (default: retrieval)`,
` --fixture-a=<path> Override fixture for codebaseA`,
` --fixture-b=<path> Override fixture for codebaseB`,
` --protocol=<path> Override discovery benchmark protocol`,
Expand Down Expand Up @@ -151,6 +143,17 @@ async function runSingleEvaluation({
fixturePath: resolvedFixture,
summary
});
} else if (mode === 'edit-preflight') {
console.log(`\n--- Phase 2: Running ${fixture.tasks.length}-task edit-preflight harness ---`);
summary = await evaluateEditPreflightFixture({
fixture,
rootPath: resolvedCodebase
});
report = formatEditPreflightReport({
codebaseLabel: label,
fixturePath: resolvedFixture,
summary
});
} else {
console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`);
const searcher = new CodebaseSearcher(resolvedCodebase);
Expand Down Expand Up @@ -202,6 +205,31 @@ function printCombinedSummary(summaries, mode) {
return;
}

if (mode === 'edit-preflight') {
const combined = combineEditPreflightSummaries(summaries);
console.log(`\n=== Combined Edit Preflight Summary ===`);
console.log(
`Top-target in top-3: ${combined.topTargetInTop3Count}/${combined.targetableTasks} (${combined.topTargetInTop3Rate === null ? 'n/a' : (combined.topTargetInTop3Rate * 100).toFixed(0) + '%'})`
);
console.log(
`Average first relevant hit: ${combined.averageFirstRelevantHit === null ? 'n/a' : combined.averageFirstRelevantHit.toFixed(2)}`
);
console.log(
`Best-example hit rate: ${combined.bestExampleHitCount}/${combined.bestExampleTasks} (${combined.bestExampleHitRate === null ? 'n/a' : (combined.bestExampleHitRate * 100).toFixed(0) + '%'})`
);
console.log(
`Safe ready rate: ${combined.safeTaskReadyCount}/${combined.safeTasks} (${combined.safeTaskReadyRate === null ? 'n/a' : (combined.safeTaskReadyRate * 100).toFixed(0) + '%'})`
);
console.log(
`Unsafe abstain rate: ${combined.unsafeTaskAbstainCount}/${combined.unsafeTasks} (${combined.unsafeTaskAbstainRate === null ? 'n/a' : (combined.unsafeTaskAbstainRate * 100).toFixed(0) + '%'})`
);
console.log(
`Unsafe ready=true false positives: ${combined.unsafeReadyFalsePositiveCount}/${combined.unsafeTasks} (${combined.unsafeReadyFalsePositiveRate === null ? 'n/a' : (combined.unsafeReadyFalsePositiveRate * 100).toFixed(0) + '%'})`
);
console.log(`=======================================\n`);
return;
}

const total = summaries.reduce((sum, summary) => sum + summary.total, 0);
const top1Correct = summaries.reduce((sum, summary) => sum + summary.top1Correct, 0);
const top3RecallCount = summaries.reduce((sum, summary) => sum + summary.top3RecallCount, 0);
Expand Down Expand Up @@ -254,17 +282,14 @@ async function main() {

const codebaseA = positionals[0];
const codebaseB = positionals[1];
const mode = values.mode === 'discovery' ? 'discovery' : 'retrieval';
const mode = resolveEvalMode(values.mode);
const defaultFixtures = getDefaultFixturePaths(projectRoot, mode);
const fixtureA = values['fixture-a']
? path.resolve(values['fixture-a'])
: mode === 'discovery'
? defaultDiscoveryFixtureA
: defaultFixtureA;
: defaultFixtures.fixtureA;
const fixtureB = values['fixture-b']
? path.resolve(values['fixture-b'])
: mode === 'discovery'
? defaultDiscoveryFixtureB
: defaultFixtureB;
: defaultFixtures.fixtureB;
const protocolPath = values.protocol
? path.resolve(values.protocol)
: defaultDiscoveryProtocol;
Expand Down Expand Up @@ -326,6 +351,25 @@ async function main() {
process.exit(gate.status === 'failed' ? 1 : 0);
}

if (mode === 'edit-preflight') {
const combinedSummary = combineEditPreflightSummaries(summaries);
printCombinedSummary(summaries, mode);
console.log(
formatEditPreflightReport({
codebaseLabel: 'combined-suite',
fixturePath: codebaseB ? `${fixtureA}, ${fixtureB}` : fixtureA,
summary: combinedSummary
})
);
if (outputPath) {
const outputDir = path.dirname(outputPath);
if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
writeFileSync(outputPath, JSON.stringify(combinedSummary, null, 2));
console.log(`\nResults written to: ${outputPath}`);
}
process.exit(0);
}

if (outputPath && mode === 'discovery' && summaries.length === 1) {
const outputDir = path.dirname(outputPath);
if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
Expand Down
Loading
Loading