Skip to content

Commit 44ef1c1

Browse files
committed
Save traces option for buffbench
1 parent 5181f84 commit 44ef1c1

File tree

5 files changed

+32
-0
lines changed

5 files changed

+32
-0
lines changed

evals/buffbench/main-hard-tasks.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ function loadTaskIds(evalPath: string): string[] {
1313
}
1414

1515
async function main() {
16+
const saveTraces = process.argv.includes('--save-traces')
17+
1618
const evalPaths = [
1719
path.join(__dirname, 'eval-codebuff2.json'),
1820
path.join(__dirname, 'eval-manifold2.json'),
@@ -33,6 +35,7 @@ async function main() {
3335
agents: ['base2', 'external:claude'],
3436
taskIds: allTaskIds,
3537
taskConcurrency: 4,
38+
saveTraces,
3639
})
3740

3841
process.exit(0)

evals/buffbench/main-nightly.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ import type { MetaAnalysisResult } from './meta-analyzer'
88
import type { AgentEvalResults } from './types'
99

1010
async function main() {
11+
const saveTraces = process.argv.includes('--save-traces')
12+
1113
console.log('Starting nightly buffbench evaluation...')
1214
console.log('Eval set: codebuff')
1315
console.log()
@@ -16,6 +18,7 @@ async function main() {
1618
evalDataPaths: [ path.join(__dirname, 'eval-codebuff.json')],
1719
agents: ['base2-free'],
1820
taskConcurrency: 3,
21+
saveTraces,
1922
})
2023

2124
console.log('\nNightly buffbench evaluation completed successfully!')

evals/buffbench/main-single-eval.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@ import path from 'path'
33
import { runBuffBench } from './run-buffbench'
44

55
async function main() {
6+
const saveTraces = process.argv.includes('--save-traces')
7+
68
await runBuffBench({
79
evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
810
agents: ['base2'],
911
taskIds: ['filter-system-history'],
12+
saveTraces,
1013
})
1114

1215
process.exit(0)

evals/buffbench/main.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@ import path from 'path'
33
import { runBuffBench } from './run-buffbench'
44

55
async function main() {
6+
const saveTraces = process.argv.includes('--save-traces')
7+
68
// Compare Codebuff agents against external CLI agents
79
// Use 'external:claude' for Claude Code CLI
810
// Use 'external:codex' for OpenAI Codex CLI
911
await runBuffBench({
1012
evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
1113
agents: ['base2-free'],
1214
taskConcurrency: 5,
15+
saveTraces,
1316
})
1417

1518
process.exit(0)

evals/buffbench/run-buffbench.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ async function runTask(options: {
5757
printEvents: boolean
5858
finalCheckCommands?: string[]
5959
disableAnalysis?: boolean
60+
saveTraces?: boolean
6061
}) {
6162
const {
6263
client,
@@ -74,6 +75,7 @@ async function runTask(options: {
7475
printEvents,
7576
finalCheckCommands,
7677
disableAnalysis,
78+
saveTraces = false,
7779
} = options
7880

7981
console.log(
@@ -173,6 +175,21 @@ async function runTask(options: {
173175
finalCheckOutputs: agentResult.finalCheckOutputs,
174176
})
175177

178+
// Save the agent trace to a separate file if saveTraces is enabled
179+
if (saveTraces) {
180+
const tracesDir = path.join(logsDir, 'traces')
181+
if (!fs.existsSync(tracesDir)) {
182+
fs.mkdirSync(tracesDir, { recursive: true })
183+
}
184+
185+
// Save agent trace only (not judge traces)
186+
const agentTracePath = path.join(
187+
tracesDir,
188+
`${index + 1}-${safeTaskId}-${safeAgentId}-${safeCommitShort}-agent.json`,
189+
)
190+
fs.writeFileSync(agentTracePath, JSON.stringify(agentResult.trace, null, 2))
191+
}
192+
176193
fs.writeFileSync(
177194
tracePath,
178195
JSON.stringify(commitTraces[commitTraces.length - 1], null, 2),
@@ -300,6 +317,7 @@ export async function runBuffBench(options: {
300317
taskIds?: string[]
301318
extractLessons?: boolean
302319
disableAnalysis?: boolean
320+
saveTraces?: boolean
303321
}) {
304322
const {
305323
evalDataPaths,
@@ -308,6 +326,7 @@ export async function runBuffBench(options: {
308326
taskIds,
309327
extractLessons = false,
310328
disableAnalysis = false,
329+
saveTraces = false,
311330
} = options
312331

313332
if (evalDataPaths.length === 0) {
@@ -453,6 +472,7 @@ export async function runBuffBench(options: {
453472
printEvents: agents.length === 1 && taskConcurrency === 1,
454473
finalCheckCommands: evalData.finalCheckCommands,
455474
disableAnalysis,
475+
saveTraces,
456476
}),
457477
)
458478
})

0 commit comments

Comments
 (0)