EntityProcess · christso · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/apps/cli/src/commands/results/remote.ts b/apps/cli/src/commands/results/remote.ts
@@ -8,7 +8,9 @@ import {
   directPushResults,
   directorySizeBytes,
   getResultsRepoStatus,
+  listGitRuns,
   loadConfig,
+  normalizeResultsConfig,
   resolveResultsRepoRunsDir,
   syncResultsRepo,
 } from '@agentv/core';
@@ -59,15 +61,6 @@ function getStatusMessage(error: unknown): string {
   return error instanceof Error ? error.message : String(error);
 }
 
-function normalizeResultsConfig(config: ResultsConfig): Required<ResultsConfig> {
-  return {
-    repo: config.repo,
-    path: config.path,
-    auto_push: config.auto_push === true,
-    branch_prefix: config.branch_prefix?.trim() || 'eval-results',
-  };
-}
-
 function statusForResult(result: EvaluationResult): 'PASS' | 'FAIL' | 'ERROR' {
   if (result.executionStatus === 'execution_error' || result.error) {
     return 'ERROR';
@@ -185,15 +178,45 @@ export async function listMergedResultFiles(
     };
   }
 
-  const remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
-    (meta) =>
-      ({
-        ...meta,
-        filename: encodeRemoteRunId(meta.filename),
-        raw_filename: meta.filename,
+  let remoteRuns: SourcedResultFileMeta[] = [];
+  if (config.mode === 'github') {
+    try {
+      const gitRuns = await listGitRuns(config.path);
+      remoteRuns = gitRuns.map((r) => ({
+        filename: encodeRemoteRunId(r.run_id),
+        raw_filename: r.run_id,
         source: 'remote' as const,
-      }) satisfies SourcedResultFileMeta,
-  );
+        path: path.join(config.path, r.manifest_path),
+        displayName: r.display_name,
+        timestamp: r.timestamp,
+        testCount: r.test_count,
+        passRate: r.pass_rate || 0,
+        avgScore: r.avg_score || 0,
+        sizeBytes: r.size_bytes || 0,
+      }));
+    } catch (error) {
+      console.error('git-native listing failed, falling back', error);
+      remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
+        (meta) =>
+          ({
+            ...meta,
+            filename: encodeRemoteRunId(meta.filename),
+            raw_filename: meta.filename,
+            source: 'remote' as const,
+          }) satisfies SourcedResultFileMeta,
+      );
+    }
+  } else {
+    remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
+      (meta) =>
+        ({
+          ...meta,
+          filename: encodeRemoteRunId(meta.filename),
+          raw_filename: meta.filename,
+          source: 'remote' as const,
+        }) satisfies SourcedResultFileMeta,
+    );
+  }
 
   const merged = [...localRuns, ...remoteRuns].sort((a, b) =>
     b.timestamp.localeCompare(a.timestamp),

diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
@@ -274,49 +274,103 @@ function inferExperimentFromRunId(runId: string): string | undefined {
   return experiment;
 }
 
+const DEFAULT_RUN_PAGE_LIMIT = 50;
+
+/** Parses `?limit=`: `undefined` when absent, `null` when not a positive integer. */
+function parseRunPageLimit(limitParam: string | undefined): number | undefined | null {
+  if (limitParam === undefined) {
+    return undefined;
+  }
+  // Digits only: rejects signs, decimals, whitespace and the empty string up front.
+  const digitsOnly = /^\d+$/.test(limitParam);
+  const parsed = digitsOnly ? Number.parseInt(limitParam, 10) : 0;
+  return parsed > 0 ? parsed : null;
+}
+
+/**
+ * Returns one page of `runs` using the `filename` of the previous page's last
+ * entry as the cursor.
+ *
+ * - `limit === undefined` disables pagination and returns every run.
+ * - A missing or empty `cursor` starts from the first run.
+ * - A `cursor` that matches no run yields an empty page without `nextCursor`.
+ *
+ * `nextCursor` is only set when runs remain after the returned page.
+ */
+function paginateRuns<T extends { filename: string }>(
+  runs: T[],
+  cursor: string | undefined,
+  limit: number | undefined,
+): { runs: T[]; nextCursor?: string } {
+  if (limit === undefined) {
+    return { runs };
+  }
+
+  // Offset of the first run on this page: right after the cursor entry, or 0.
+  const start = cursor ? runs.findIndex((run) => run.filename === cursor) + 1 : 0;
+  if (cursor && start === 0) {
+    return { runs: [] };
+  }
+
+  const end = start + limit;
+  const page = runs.slice(start, end);
+  const last = end < runs.length ? page.at(-1) : undefined;
+  return last ? { runs: page, nextCursor: last.filename } : { runs: page };
+}
+
 async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
   const { runs: metas } = await listMergedResultFiles(searchDir);
   const { threshold: passThreshold } = loadStudioConfig(agentvDir);
-  return c.json({
-    runs: metas.map((m) => {
-      let target: string | undefined;
-      let experiment = inferExperimentFromRunId(m.raw_filename);
-      let passRate = m.passRate;
-      try {
-        const records = loadLightweightResults(m.path);
-        if (records.length > 0) {
-          target = records[0].target;
-          experiment = records[0].experiment ?? experiment;
-          passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
-        } else {
-          // Run is in-progress with 0 results written yet — fall back to the
-          // in-memory target stored when the Studio launched this run.
-          target = getActiveRunTarget(m.path);
-        }
-      } catch {
-        // ignore enrichment errors
+  const parsedLimit = parseRunPageLimit(c.req.query('limit'));
+  if (parsedLimit === null) {
+    return c.json({ error: 'limit must be a positive integer' }, 400);
+  }
+
+  const cursor = c.req.query('cursor');
+  const limit = parsedLimit ?? (cursor ? DEFAULT_RUN_PAGE_LIMIT : undefined);
+  const runs = metas.map((m) => {
+    let target: string | undefined;
+    let experiment = inferExperimentFromRunId(m.raw_filename);
+    let passRate = m.passRate;
+    try {
+      const records = loadLightweightResults(m.path);
+      if (records.length > 0) {
+        target = records[0].target;
+        experiment = records[0].experiment ?? experiment;
+        passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
+      } else {
+        // Run is in-progress with 0 results written yet — fall back to the
+        // in-memory target stored when the Studio launched this run.
+        target = getActiveRunTarget(m.path);
       }
-      // Surface live status for Studio-launched runs that are still starting
-      // or running so the RunList can render a spinner instead of the
-      // pass/fail dot derived from a 0% pass rate.
-      const liveStatus = getActiveRunStatus(m.path);
-      const tagsEntry = readRunTags(m.path);
-      return {
-        filename: m.filename,
-        display_name: m.displayName,
-        path: m.path,
-        timestamp: m.timestamp,
-        test_count: m.testCount,
-        pass_rate: passRate,
-        avg_score: m.avgScore,
-        size_bytes: m.sizeBytes,
-        source: m.source,
-        ...(target && { target }),
-        ...(experiment && { experiment }),
-        ...(tagsEntry && { tags: tagsEntry.tags }),
-        ...(liveStatus && { status: liveStatus }),
-      };
-    }),
+    } catch {
+      // ignore enrichment errors
+    }
+    // Surface live status for Studio-launched runs that are still starting
+    // or running so the RunList can render a spinner instead of the
+    // pass/fail dot derived from a 0% pass rate.
+    const liveStatus = getActiveRunStatus(m.path);
+    const tagsEntry = readRunTags(m.path);
+    return {
+      filename: m.filename,
+      display_name: m.displayName,
+      path: m.path,
+      timestamp: m.timestamp,
+      test_count: m.testCount,
+      pass_rate: passRate,
+      avg_score: m.avgScore,
+      size_bytes: m.sizeBytes,
+      source: m.source,
+      ...(target && { target }),
+      ...(experiment && { experiment }),
+      ...(tagsEntry && { tags: tagsEntry.tags }),
+      ...(liveStatus && { status: liveStatus }),
+    };
+  });
+  const page = paginateRuns(runs, cursor, limit);
+  return c.json({
+    runs: page.runs,
+    ...(page.nextCursor ? { next_cursor: page.nextCursor } : {}),
   });
 }
 

diff --git a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
@@ -1,66 +1,77 @@
-import { mkdir, readFile, rm, writeFile } from 'node:fs/promises';
+import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
 import { join } from 'node:path';
-import { afterEach, describe, expect, it } from 'vitest';
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';
 
 const FIXTURE_DIR = join(import.meta.dirname, 'fixtures');
-const OUT_DIR = join(import.meta.dirname, '__tmp_pipeline_e2e__');
 const CLI_ENTRY = join(import.meta.dirname, '../../../../src/cli.ts');
 const EVAL_PATH = join(FIXTURE_DIR, 'input-test.eval.yaml');
+const PIPELINE_E2E_TIMEOUT_MS = 60_000;
 
 describe('eval pipeline e2e', () => {
+  let outDir: string; // fresh temp dir per test: created in beforeEach, removed in afterEach
+
+  beforeEach(async () => {
+    outDir = await mkdtemp(join(tmpdir(), 'agentv-pipeline-e2e-'));
+  });
+
   afterEach(async () => {
-    await rm(OUT_DIR, { recursive: true, force: true });
+    await rm(outDir, { recursive: true, force: true });
   });
 
-  it('runs full input → grade → bench pipeline', async () => {
-    const { execa } = await import('execa');
+  it(
+    'runs full input → grade → bench pipeline',
+    async () => {
+      const { execa } = await import('execa');
 
-    // Step 1: pipeline input
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);
-    const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8'));
-    expect(manifest.test_ids).toEqual(['test-01']);
+      // Step 1: pipeline input — expected to write manifest.json into outDir
+      await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', outDir]);
+      const manifest = JSON.parse(await readFile(join(outDir, 'manifest.json'), 'utf8'));
+      expect(manifest.test_ids).toEqual(['test-01']);
 
-    // Step 2: Write mock response.md (simulating target execution)
-    await writeFile(join(OUT_DIR, 'input-test', 'test-01', 'response.md'), 'hello world response');
+      // Step 2: Write mock response.md (simulating target execution)
+      await writeFile(join(outDir, 'input-test', 'test-01', 'response.md'), 'hello world response');
 
-    // Step 3: pipeline grade
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', OUT_DIR]);
-    const gradeResult = JSON.parse(
-      await readFile(
-        join(OUT_DIR, 'input-test', 'test-01', 'code_grader_results', 'contains_hello.json'),
-        'utf8',
-      ),
-    );
-    expect(gradeResult.score).toBe(1);
+      // Step 3: pipeline grade — expected to write code_grader_results/contains_hello.json
+      await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', outDir]);
+      const gradeResult = JSON.parse(
+        await readFile(
+          join(outDir, 'input-test', 'test-01', 'code_grader_results', 'contains_hello.json'),
+          'utf8',
+        ),
+      );
+      expect(gradeResult.score).toBe(1);
 
-    // Step 4: Write mock LLM grader result to disk, then run pipeline bench
-    const llmResultsDir = join(OUT_DIR, 'input-test', 'test-01', 'llm_grader_results');
-    await mkdir(llmResultsDir, { recursive: true });
-    await writeFile(
-      join(llmResultsDir, 'relevance.json'),
-      JSON.stringify({
-        score: 0.9,
-        assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }],
-      }),
-    );
-    await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
+      // Step 4: Write mock LLM grader result to disk, then run pipeline bench
+      const llmResultsDir = join(outDir, 'input-test', 'test-01', 'llm_grader_results');
+      await mkdir(llmResultsDir, { recursive: true });
+      await writeFile(
+        join(llmResultsDir, 'relevance.json'),
+        JSON.stringify({
+          score: 0.9,
+          assertions: [{ text: 'Response is relevant', passed: true, evidence: 'echoes input' }],
+        }),
+      );
+      await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', outDir]);
 
-    // Verify final artifacts
-    const grading = JSON.parse(
-      await readFile(join(OUT_DIR, 'input-test', 'test-01', 'grading.json'), 'utf8'),
-    );
-    expect(grading.graders).toHaveLength(2);
-    expect(grading.summary.pass_rate).toBeGreaterThan(0);
+      // Verify final artifacts
+      const grading = JSON.parse(
+        await readFile(join(outDir, 'input-test', 'test-01', 'grading.json'), 'utf8'),
+      );
+      expect(grading.graders).toHaveLength(2);
+      expect(grading.summary.pass_rate).toBeGreaterThan(0);
 
-    const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
-    const indexLines = indexContent
-      .trim()
-      .split('\n')
-      .map((line) => JSON.parse(line));
-    expect(indexLines).toHaveLength(1);
-    expect(indexLines[0].test_id).toBe('test-01');
+      const indexContent = await readFile(join(outDir, 'index.jsonl'), 'utf8');
+      const indexLines = indexContent
+        .trim()
+        .split('\n')
+        .map((line) => JSON.parse(line));
+      expect(indexLines).toHaveLength(1);
+      expect(indexLines[0].test_id).toBe('test-01');
 
-    const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
-    expect(benchmark.run_summary).toBeDefined();
-  }, 30_000);
+      const benchmark = JSON.parse(await readFile(join(outDir, 'benchmark.json'), 'utf8'));
+      expect(benchmark.run_summary).toBeDefined();
+    },
+    PIPELINE_E2E_TIMEOUT_MS,
+  );
 });