diff --git a/packages/app/cypress/e2e/measured-power-overlay.cy.ts b/packages/app/cypress/e2e/measured-power-overlay.cy.ts new file mode 100644 index 00000000..b8b0191e --- /dev/null +++ b/packages/app/cypress/e2e/measured-power-overlay.cy.ts @@ -0,0 +1,55 @@ +// Verifies the new measured-power Y-axis options render on the unofficial-run +// overlay path against a real GitHub Actions artifact (run 26312107787 — the +// on-PR sweep for PR #1558 / qwen3.5-fp8-h200-sglang). This is the canonical +// "preview before merge" test path per CLAUDE.md's overlay requirement. + +describe('Measured power on unofficial-run overlay', () => { + beforeEach(() => { + cy.visit('/inference?unofficialrun=26312107787', { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + win.localStorage.setItem('inferencex-feature-gate', '1'); + }, + }); + cy.get('[data-testid="inference-chart-display"]', { timeout: 30_000 }).should('exist'); + }); + + it('exposes the Measured Energy dropdown group and renders overlay points', () => { + // Open Y-axis dropdown + cy.get('[data-testid="yaxis-metric-selector"]').click(); + cy.get('[data-slot="select-content"]').should('exist'); + + // Verify the gated "Measured Energy" group + both options. The select list is a + // scroll container (max-h-72 overflow-y-auto), and this group sits below the fold, + // so scroll each target into view before asserting visibility. + cy.contains('[data-slot="select-content"]', 'Measured Energy') + .scrollIntoView() + .should('be.visible'); + cy.contains('[role="option"]', 'Measured Average Power per GPU') + .scrollIntoView() + .should('be.visible'); + cy.contains('[role="option"]', 'Measured Joules per Output Token') + .scrollIntoView() + .should('be.visible'); + + // Select the power option + cy.contains('[role="option"]', 'Measured Average Power per GPU').click(); + cy.get('[data-slot="select-content"]').should('not.exist'); + + // Initial-load screenshot + cy.screenshot('measured-power-selected', { capture: 'viewport' }); + + // The chart should now contain SVG + / elements + // (overlay points typically render as triangles). Existence is enough — + // visual correctness is reviewed in the screenshot. + cy.get('[data-testid="inference-chart-display"] svg', { timeout: 10_000 }).should('exist'); + }); + + it('switches to Measured Joules per Output Token without errors', () => { + cy.get('[data-testid="yaxis-metric-selector"]').click(); + cy.contains('[role="option"]', 'Measured Joules per Output Token').click(); + cy.get('[data-slot="select-content"]').should('not.exist'); + cy.screenshot('measured-joules-selected', { capture: 'viewport' }); + cy.get('[data-testid="inference-chart-display"] svg').should('exist'); + }); +}); diff --git a/packages/app/src/app/api/unofficial-run/route.test.ts b/packages/app/src/app/api/unofficial-run/route.test.ts index 2f317c71..be71324b 100644 --- a/packages/app/src/app/api/unofficial-run/route.test.ts +++ b/packages/app/src/app/api/unofficial-run/route.test.ts @@ -206,6 +206,36 @@ describe('normalizeArtifactRows', () => { ); expect(rows.every((r) => r.date === '2026-03-11')).toBe(true); }); + + it('surfaces the per-worker measured-power array on the BenchmarkRow', () => { + const workers = [ + { + role: 'prefill', + worker_idx: 0, + hosts: ['pn0'], + num_gpus: 4, + avg_power_w: 612.3, + avg_temp_c: 71.2, + }, + { + role: 'decode', + worker_idx: 0, + hosts: ['dn0', 'dn1'], + num_gpus: 8, + avg_power_w: 712.1, + }, + ]; + const rows = normalizeArtifactRows([rawRow({ workers })], '2026-03-01'); + expect(rows[0].workers).toHaveLength(2); + expect(rows[0].workers![0].hosts).toEqual(['pn0']); + expect(rows[0].workers![0].avg_temp_c).toBe(71.2); + expect(rows[0].workers![1].role).toBe('decode'); + }); + + it('leaves workers undefined when the artifact omits the field', () => { + const rows = normalizeArtifactRows([rawRow()], '2026-03-01'); + expect(rows[0].workers).toBeUndefined(); + }); }); describe('normalizeEvalArtifactRows', () => { diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index a90e26fc..072c99f1 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -55,6 +55,9 @@ export function normalizeArtifactRows( conc: params.conc, image: params.image, metrics: params.metrics, + // Surface the same per-worker payload the DB path emits so unofficial + // overlays carry the multinode measured-power breakdown too. + workers: params.workers, date, run_url: runUrl, }); diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index ab338b87..19887884 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -3,6 +3,50 @@ import type React from 'react'; import type { HardwareEntry } from '@/lib/constants'; import type { Model, Sequence } from '@/lib/data-mappings'; +/** + * Role of a single worker process in a multinode / disaggregated deployment. + * - `prefill` / `decode`: the two halves of a disaggregated serving setup + * - `agg`: an aggregated (non-disagg) worker that handles both phases + * - `frontend`: a router / load-balancer process (typically zero GPUs) + * + * Carried on `WorkerPower.role` as `string` (not the literal union) because + * the runner emits the role at the JSONB boundary — we can't statically + * guarantee the value at the type system level. Consumers that switch on the + * role should narrow via `if (role === 'prefill') ...` or a `WorkerRole` + * cast at the point of use. + */ +export type WorkerRole = 'prefill' | 'decode' | 'agg' | 'frontend'; + +/** + * Per-worker measured power entry emitted by the runner's aggregate_power.py + * for multinode and disaggregated runs. The chart layer can use these to + * surface a stacked breakdown of where energy is spent across worker types. + * + * `hosts` lists the node hostnames whose perfmon CSVs were rolled up into + * this worker entry (a single-node worker has one host; a multinode decode + * worker spanning 4 nodes has four). Optional because pre-multinode versions + * of aggregate_power.py didn't emit it. + * + * `avg_temp_c`, `peak_temp_c`, `avg_util_pct`, `avg_mem_used_mb` mirror the + * cluster-wide telemetry scalars and are only present when the perfmon CSVs + * include the corresponding sample columns. Each is optional so callers can + * distinguish "field absent from this run" from "field present and equal to 0". + */ +export interface WorkerPower { + // `string` rather than `WorkerRole` so the type lines up with what we get + // from the JSONB column without an unsafe cast at every boundary. Chart + // code can still narrow on the literal values it understands. + role: string; + worker_idx: number; + hosts?: string[]; + num_gpus: number; + avg_power_w: number; + avg_temp_c?: number; + peak_temp_c?: number; + avg_util_pct?: number; + avg_mem_used_mb?: number; +} + /** * Represents an aggregated data entry, typically from a raw data source. * This interface contains various performance metrics. @@ -72,6 +116,31 @@ export interface AggDataEntry { avg_power_w?: number; joules_per_output_token?: number; joules_per_total_token?: number; + // Multinode / disagg-only measured power. The aggregate_power.py runner + // emits per-role energy splits when the deployment has separate prefill + // and decode workers (single-node disagg or multinode disagg). Single-node + // aggregated configs leave these undefined. + // - prefill_avg_power_w / decode_avg_power_w: mean per-GPU draw (W) within each role + // - joules_per_input_token: prefill_energy / total_input_tokens (prefill GPUs only) + // The disagg decode-only J/output is carried by joules_per_output_token above + // (the runner overrides it to decode_energy / total_output_tokens on disagg) — + // there is no separate _decode field. + prefill_avg_power_w?: number; + decode_avg_power_w?: number; + joules_per_input_token?: number; + // Cluster-wide GPU telemetry beyond power (temperature, utilization, memory). + // Emitted by aggregate_power.py when the perfmon CSVs include the matching + // sample columns. Optional because older runs (and runs without the relevant + // perfmon samples) leave them unset — the chart layer must distinguish "no + // measurement" from "0". + avg_temp_c?: number; + peak_temp_c?: number; + avg_util_pct?: number; + avg_mem_used_mb?: number; + // Per-worker measured power breakdown. Each entry is one worker process + // (a prefill, decode, agg, or frontend role). Optional because pre-multinode + // and pre-aggregate_power.py runs don't emit it. + workers?: WorkerPower[]; disagg: boolean; num_prefill_gpu: number; num_decode_gpu: number; diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 1d7e8a3f..5bed8907 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -3,6 +3,8 @@ * Each function is a thin fetch wrapper returning typed data. */ +import type { WorkerPower } from '@/components/inference/types'; + import type { SubmissionsResponse } from './submissions-types'; export interface BenchmarkRow { @@ -28,6 +30,15 @@ export interface BenchmarkRow { conc: number; image: string | null; metrics: Record; + /** + * Per-worker measured power for multinode / disagg runs. The runner emits + * this as a JSONB sibling of the scalar metrics; the API layer surfaces it + * as a separate field here so the scalar `metrics` index signature can stay + * `Record` and existing `m.x ?? 0` call sites keep narrowing + * cleanly. Undefined for single-node runs and any run predating + * aggregate_power.py. + */ + workers?: WorkerPower[]; date: string; run_url: string | null; } diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index 42d2ed5a..b49fae39 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -133,6 +133,116 @@ describe('rowToAggDataEntry', () => { expect(entry.avg_power_w).toBeUndefined(); expect(entry.joules_per_output_token).toBeUndefined(); }); + + it('passes through multinode / disagg role-split power scalars when present', () => { + const entry = rowToAggDataEntry( + makeRow({ + metrics: { + tput_per_gpu: 100, + prefill_avg_power_w: 612.3, + decode_avg_power_w: 701.5, + joules_per_input_token: 1.2, + // disagg: joules_per_output_token IS the per-stage decode value. + joules_per_output_token: 9.7, + }, + }), + ); + expect(entry.prefill_avg_power_w).toBe(612.3); + expect(entry.decode_avg_power_w).toBe(701.5); + expect(entry.joules_per_input_token).toBe(1.2); + expect(entry.joules_per_output_token).toBe(9.7); + }); + + it('passes through per-worker measured power array intact', () => { + const workers = [ + { role: 'prefill' as const, worker_idx: 0, num_gpus: 4, avg_power_w: 588.4 }, + { role: 'prefill' as const, worker_idx: 1, num_gpus: 4, avg_power_w: 601.2 }, + { role: 'decode' as const, worker_idx: 0, num_gpus: 8, avg_power_w: 712.1 }, + { role: 'frontend' as const, worker_idx: 0, num_gpus: 0, avg_power_w: 0 }, + ]; + const entry = rowToAggDataEntry(makeRow({ workers })); + expect(entry.workers).toEqual(workers); + }); + + it('defensively drops a non-array workers payload', () => { + // The DB JSONB column is untyped at the wire boundary, so guard against a + // malformed row reaching downstream consumers. + const entry = rowToAggDataEntry( + // eslint-disable-next-line @typescript-eslint/no-explicit-any + makeRow({ workers: 'oops' as any }), + ); + expect(entry.workers).toBeUndefined(); + }); + + it('leaves multinode role-split scalars and workers undefined for legacy rows', () => { + // Single-node configs predating the multinode runner don't emit any of + // the role-split fields; transform must yield undefined (not 0) so the + // chart layer can distinguish "no measurement" from a real zero. + const entry = rowToAggDataEntry(makeRow({ metrics: {} })); + expect(entry.prefill_avg_power_w).toBeUndefined(); + expect(entry.decode_avg_power_w).toBeUndefined(); + expect(entry.joules_per_input_token).toBeUndefined(); + expect(entry.workers).toBeUndefined(); + }); + + it('passes through cluster-wide temp/util/mem scalars when present', () => { + const entry = rowToAggDataEntry( + makeRow({ + metrics: { + tput_per_gpu: 100, + avg_temp_c: 68.4, + peak_temp_c: 79.2, + avg_util_pct: 88.5, + avg_mem_used_mb: 71234.5, + }, + }), + ); + expect(entry.avg_temp_c).toBe(68.4); + expect(entry.peak_temp_c).toBe(79.2); + expect(entry.avg_util_pct).toBe(88.5); + expect(entry.avg_mem_used_mb).toBe(71234.5); + }); + + it('leaves cluster-wide temp/util/mem fields undefined when absent (legacy rows)', () => { + // Same undefined-vs-zero distinction as the measured-power scalars — + // historic rows predate the perfmon CSV scrape, so missing values must + // not be silently coerced to 0. + const entry = rowToAggDataEntry(makeRow({ metrics: {} })); + expect(entry.avg_temp_c).toBeUndefined(); + expect(entry.peak_temp_c).toBeUndefined(); + expect(entry.avg_util_pct).toBeUndefined(); + expect(entry.avg_mem_used_mb).toBeUndefined(); + }); + + it('preserves new optional WorkerPower fields (hosts, telemetry) on workers entries', () => { + const workers = [ + { + role: 'prefill' as const, + worker_idx: 0, + hosts: ['pn0'], + num_gpus: 4, + avg_power_w: 612.3, + avg_temp_c: 71.2, + peak_temp_c: 78, + avg_util_pct: 92.1, + avg_mem_used_mb: 65432, + }, + { + role: 'decode' as const, + worker_idx: 0, + hosts: ['dn0', 'dn1', 'dn2', 'dn3'], + num_gpus: 16, + avg_power_w: 712.1, + }, + ]; + const entry = rowToAggDataEntry(makeRow({ workers })); + expect(entry.workers).toEqual(workers); + expect(entry.workers![0].hosts).toEqual(['pn0']); + expect(entry.workers![0].avg_temp_c).toBe(71.2); + expect(entry.workers![1].hosts).toEqual(['dn0', 'dn1', 'dn2', 'dn3']); + // Optional telemetry fields stay undefined when source omits them. + expect(entry.workers![1].avg_temp_c).toBeUndefined(); + }); }); describe('transformBenchmarkRows', () => { diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index 8b4742e1..87b48558 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -55,6 +55,23 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { avg_power_w: m.avg_power_w, joules_per_output_token: m.joules_per_output_token, joules_per_total_token: m.joules_per_total_token, + // Multinode / disagg-only role splits — same undefined-for-legacy pattern. + // (disagg's decode-only J/output is carried by joules_per_output_token above, + // which the runner overrides to the per-stage value — no separate _decode key.) + prefill_avg_power_w: m.prefill_avg_power_w, + decode_avg_power_w: m.decode_avg_power_w, + joules_per_input_token: m.joules_per_input_token, + // Cluster-wide GPU telemetry beyond power. Emitted when the perfmon CSVs + // include the corresponding sample columns; left undefined otherwise so + // the chart layer can distinguish "no measurement" from a real zero. + avg_temp_c: m.avg_temp_c, + peak_temp_c: m.peak_temp_c, + avg_util_pct: m.avg_util_pct, + avg_mem_used_mb: m.avg_mem_used_mb, + // Per-worker measured power. Surfaced on BenchmarkRow as a sibling of the + // scalar `metrics` dict (see api.ts). Narrow defensively so a malformed + // payload can't poison downstream consumers. + workers: Array.isArray(row.workers) ? row.workers : undefined, disagg: row.disagg, num_prefill_gpu: row.num_prefill_gpu, num_decode_gpu: row.num_decode_gpu, diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts index 037d7df4..7fa88c97 100644 --- a/packages/constants/src/metric-keys.ts +++ b/packages/constants/src/metric-keys.ts @@ -45,10 +45,35 @@ export const METRIC_KEYS = new Set([ 'std_intvty', // measured power / energy (emitted by runner's aggregate_power.py) // avg_power_w: mean per-GPU draw (W) during the load window - // joules_per_output_token: avg_power_w * num_gpus * duration / total_output_tokens - // joules_per_total_token: avg_power_w * num_gpus * duration / (total_input + total_output) - // — workload-shape-fair view that doesn't treat prompt as free + // joules_per_output_token: energy / total_output_tokens. CLUSTER-WIDE on + // single-node / non-disagg (total_system_energy); + // PER-STAGE decode_energy on disagg (decode GPUs only), + // symmetric with joules_per_input_token below. + // joules_per_total_token: total_system_energy / (total_input + total_output) + // — cluster-wide; workload-shape-fair view that + // doesn't treat prompt as free. 'avg_power_w', 'joules_per_output_token', 'joules_per_total_token', + // multinode / disagg role splits (emitted only when the deployment has + // distinct prefill / decode workers) + // prefill_avg_power_w / decode_avg_power_w: mean per-GPU draw within each role + // joules_per_input_token: prefill_energy / total_input_tokens (prefill GPUs only). + // The disagg output counterpart is joules_per_output_token above (decode GPUs + // only) — there is no separate _decode key. + 'prefill_avg_power_w', + 'decode_avg_power_w', + 'joules_per_input_token', + // cluster-wide GPU telemetry beyond power (emitted by aggregate_power.py when + // the perfmon CSVs include temperature, utilization, or memory samples). + // avg_temp_c: mean per-GPU temperature (Celsius) during load window + // peak_temp_c: max instantaneous per-GPU temperature in window + // avg_util_pct: mean per-GPU GPU-utilization percent (0-100) + // avg_mem_used_mb: mean per-GPU memory used (MiB / MB) + // Single-node and multinode runs both surface these as flat scalars; the + // per-worker breakdown carries the same fields on each entry in workers[]. + 'avg_temp_c', + 'peak_temp_c', + 'avg_util_pct', + 'avg_mem_used_mb', ]); diff --git a/packages/db/migrations/006_benchmark_results_workers.sql b/packages/db/migrations/006_benchmark_results_workers.sql new file mode 100644 index 00000000..28b84acb --- /dev/null +++ b/packages/db/migrations/006_benchmark_results_workers.sql @@ -0,0 +1,42 @@ +-- ============================================================ +-- BENCHMARK_RESULTS — per-worker measured power breakdown +-- ============================================================ +-- +-- Multinode and disaggregated runs emit a per-worker telemetry array from the +-- runner's aggregate_power.py — one entry per prefill/decode/agg/frontend +-- worker with {role, worker_idx, hosts[], num_gpus, avg_power_w, ...optional +-- temp/util/mem fields}. We keep this in a separate JSONB column rather than +-- stuffing it into `metrics` because: +-- +-- 1. metrics is a flat Record: every API consumer (and the +-- benchmark-mapper warning) assumes scalar values. An array of objects +-- under one key would break parseNum and surface as "missing" everywhere. +-- 2. workers is large (one entry per worker, potentially dozens on a wide +-- multinode disagg run) and only used by a narrow set of features. +-- Keeping it in its own column lets future queries skip the field when +-- a SELECT doesn't need it. +-- +-- Null for single-node runs (which don't have per-worker splits) and any +-- benchmark predating the aggregate_power.py multinode patch. + +alter table benchmark_results + add column workers jsonb; + +-- Re-create the latest_benchmarks materialized view so the new column rides +-- on the view as well. SELECT * in the original definition would not pick up +-- columns added after the view was created, so DROP + CREATE is the cleanest +-- path. The view's pre-existing indexes are also re-created. + +drop materialized view if exists latest_benchmarks; + +create materialized view latest_benchmarks as +select distinct on (br.config_id, br.conc, br.isl, br.osl) + br.* +from benchmark_results br +join latest_workflow_runs wr on wr.id = br.workflow_run_id +where br.error is null +order by br.config_id, br.conc, br.isl, br.osl, + br.date desc, wr.run_started_at desc nulls last; + +create unique index latest_benchmarks_pk on latest_benchmarks (config_id, conc, isl, osl); +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts index 67173c64..a5493629 100644 --- a/packages/db/src/etl/benchmark-ingest.ts +++ b/packages/db/src/etl/benchmark-ingest.ts @@ -40,11 +40,17 @@ export async function bulkIngestBenchmarkRows( const concs = deduped.map((r) => r.conc); const images = deduped.map((r) => r.image); const metricsJsons = deduped.map((r) => JSON.stringify(r.metrics)); + // workers is optional — encode missing values as JSON null so the JSONB + // unnest input has a homogeneous type (jsonb[]) and stores SQL NULL in the + // column for rows that didn't emit a per-worker breakdown. + const workersJsons = deduped.map((r) => + r.workers === undefined ? null : JSON.stringify(r.workers), + ); const result = await sql<{ inserted: boolean; id: number }[]>` insert into benchmark_results ( workflow_run_id, config_id, benchmark_type, date, - isl, osl, conc, image, metrics + isl, osl, conc, image, metrics, workers ) select ${workflowRunId}, @@ -55,11 +61,13 @@ export async function bulkIngestBenchmarkRows( unnest(${sql.array(osls)}::int[]), unnest(${sql.array(concs)}::int[]), unnest(${sql.array(images)}), - unnest(${sql.array(metricsJsons)}::jsonb[]) + unnest(${sql.array(metricsJsons)}::jsonb[]), + unnest(${sql.array(workersJsons)}::jsonb[]) on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc) do update set metrics = excluded.metrics, - image = excluded.image + image = excluded.image, + workers = excluded.workers returning (xmax = 0) as inserted, id `; diff --git a/packages/db/src/etl/benchmark-mapper.test.ts b/packages/db/src/etl/benchmark-mapper.test.ts index 6c233de4..65fb3e39 100644 --- a/packages/db/src/etl/benchmark-mapper.test.ts +++ b/packages/db/src/etl/benchmark-mapper.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { mapBenchmarkRow } from './benchmark-mapper'; +import { extractWorkers, mapBenchmarkRow } from './benchmark-mapper'; import { createSkipTracker } from './skip-tracker'; /** Minimal valid v1 benchmark row. */ @@ -388,4 +388,185 @@ describe('mapBenchmarkRow', () => { expect(result!.config.isMultinode).toBe(false); }); }); + + describe('workers payload (multinode / disagg measured power)', () => { + it('leaves workers undefined when the row omits the field', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeV1Row(), tracker); + + expect(result!.workers).toBeUndefined(); + }); + + it('extracts a multinode disagg workers array intact', () => { + const tracker = createSkipTracker(); + const workers = [ + { + role: 'prefill', + worker_idx: 0, + hosts: ['pn0'], + num_gpus: 4, + avg_power_w: 612.3, + avg_temp_c: 71.2, + peak_temp_c: 78, + avg_util_pct: 92.1, + avg_mem_used_mb: 65432, + }, + { + role: 'decode', + worker_idx: 0, + hosts: ['dn0', 'dn1', 'dn2', 'dn3'], + num_gpus: 16, + avg_power_w: 712.1, + }, + ]; + const result = mapBenchmarkRow(makeV2Row({ workers }), tracker); + + expect(result!.workers).toHaveLength(2); + expect(result!.workers![0].role).toBe('prefill'); + expect(result!.workers![0].hosts).toEqual(['pn0']); + expect(result!.workers![0].avg_power_w).toBe(612.3); + expect(result!.workers![0].avg_temp_c).toBe(71.2); + expect(result!.workers![0].peak_temp_c).toBe(78); + expect(result!.workers![0].avg_util_pct).toBe(92.1); + expect(result!.workers![0].avg_mem_used_mb).toBe(65432); + expect(result!.workers![1].role).toBe('decode'); + expect(result!.workers![1].hosts).toEqual(['dn0', 'dn1', 'dn2', 'dn3']); + expect(result!.workers![1].num_gpus).toBe(16); + // Optional telemetry fields stay absent when the source omits them. + expect(result!.workers![1].avg_temp_c).toBeUndefined(); + }); + + it('does not write workers into the metrics record', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeV1Row({ + workers: [{ role: 'prefill', worker_idx: 0, num_gpus: 4, avg_power_w: 500 }], + }), + tracker, + ); + + // workers is an array — parseNum yields undefined, but the explicit + // NON_METRIC_KEYS entry is what guarantees it never leaks into metrics. + expect(result!.metrics).not.toHaveProperty('workers'); + }); + + it('captures new cluster-wide temp / util / mem scalars into metrics', () => { + // These are flat scalars on the agg row (sibling of avg_power_w), so + // the auto-capture path must store them under their raw keys without + // emitting a "[WARN] auto-captured unexpected metric" warning. The + // warning suppression is verified indirectly: METRIC_KEYS now contains + // these keys, so a clean test run never produces a warning. + const tracker = createSkipTracker(); + const result = mapBenchmarkRow( + makeV1Row({ + avg_temp_c: 68.4, + peak_temp_c: 79.2, + avg_util_pct: 88.5, + avg_mem_used_mb: 71234.5, + }), + tracker, + ); + + expect(result!.metrics.avg_temp_c).toBe(68.4); + expect(result!.metrics.peak_temp_c).toBe(79.2); + expect(result!.metrics.avg_util_pct).toBe(88.5); + expect(result!.metrics.avg_mem_used_mb).toBe(71234.5); + }); + + it('backward-compat: row without temp/util/mem still maps cleanly', () => { + const tracker = createSkipTracker(); + const result = mapBenchmarkRow(makeV1Row(), tracker); + + expect(result).not.toBeNull(); + expect(result!.metrics).not.toHaveProperty('avg_temp_c'); + expect(result!.metrics).not.toHaveProperty('peak_temp_c'); + expect(result!.metrics).not.toHaveProperty('avg_util_pct'); + expect(result!.metrics).not.toHaveProperty('avg_mem_used_mb'); + expect(result!.workers).toBeUndefined(); + }); + }); +}); + +describe('extractWorkers', () => { + it('returns undefined for non-array input', () => { + expect(extractWorkers(undefined)).toBeUndefined(); + expect(extractWorkers(null)).toBeUndefined(); + expect(extractWorkers('not-an-array')).toBeUndefined(); + expect(extractWorkers(42)).toBeUndefined(); + expect(extractWorkers({ role: 'prefill' })).toBeUndefined(); + }); + + it('returns undefined for an empty array', () => { + expect(extractWorkers([])).toBeUndefined(); + }); + + it('keeps well-formed entries and drops malformed ones', () => { + const result = extractWorkers([ + { role: 'prefill', worker_idx: 0, num_gpus: 4, avg_power_w: 500 }, + // missing role + { worker_idx: 1, num_gpus: 4, avg_power_w: 500 }, + // missing avg_power_w + { role: 'decode', worker_idx: 0, num_gpus: 4 }, + // ok + { role: 'frontend', worker_idx: 0, num_gpus: 0, avg_power_w: 0 }, + ]); + + expect(result).toHaveLength(2); + expect(result![0].role).toBe('prefill'); + expect(result![1].role).toBe('frontend'); + expect(result![1].avg_power_w).toBe(0); + }); + + it('coerces string-numeric values via parseNum / parseInt2', () => { + const result = extractWorkers([ + { + role: 'decode', + worker_idx: '2', + num_gpus: '8', + avg_power_w: '712.5', + avg_temp_c: '71.5', + }, + ]); + + expect(result).toHaveLength(1); + expect(result![0].worker_idx).toBe(2); + expect(result![0].num_gpus).toBe(8); + expect(result![0].avg_power_w).toBe(712.5); + expect(result![0].avg_temp_c).toBe(71.5); + }); + + it('drops hosts when it is not an all-string array', () => { + const result = extractWorkers([ + { + role: 'prefill', + worker_idx: 0, + num_gpus: 4, + avg_power_w: 500, + hosts: 'pn0', + }, + { + role: 'decode', + worker_idx: 0, + num_gpus: 4, + avg_power_w: 500, + hosts: ['dn0', 42], + }, + { + role: 'agg', + worker_idx: 0, + num_gpus: 4, + avg_power_w: 500, + hosts: ['ok'], + }, + ]); + + expect(result).toHaveLength(3); + expect(result![0].hosts).toBeUndefined(); + expect(result![1].hosts).toBeUndefined(); + expect(result![2].hosts).toEqual(['ok']); + }); + + it('returns undefined when every entry is malformed', () => { + expect(extractWorkers([null, 'bad', 0, undefined])).toBeUndefined(); + }); }); diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 7d78e175..b25baf60 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -57,6 +57,10 @@ const NON_METRIC_KEYS = new Set([ 'decode_num_workers', 'num_prefill_gpu', 'num_decode_gpu', + // per-worker measured-power array (not a numeric scalar). Surfaced as a + // sibling of the metrics JSONB by mapBenchmarkRow so the metrics column + // stays Record for the index signature on BenchmarkRow. + 'workers', ]); /** @@ -68,6 +72,23 @@ const NON_METRIC_KEYS = new Set([ // Deduplicate warnings: each unexpected key only prints once per process. const _warnedMetricKeys = new Set(); +/** + * One per-worker entry from aggregate_power.py's `workers` array. + * Fields after `avg_power_w` are optional because the perfmon CSVs may not + * include the corresponding sample columns on every run. + */ +export interface WorkerPower { + role: string; + worker_idx: number; + hosts?: string[]; + num_gpus: number; + avg_power_w: number; + avg_temp_c?: number; + peak_temp_c?: number; + avg_util_pct?: number; + avg_mem_used_mb?: number; +} + export interface BenchmarkParams { config: ConfigParams; isl: number; @@ -75,6 +96,15 @@ export interface BenchmarkParams { conc: number; image: string | null; metrics: Record; + /** + * Per-worker measured-power breakdown emitted by the runner's + * aggregate_power.py on multinode / disagg runs. Stored on + * benchmark_results in a dedicated JSONB column (added in migration 006) + * rather than inside `metrics` so the metrics index signature can stay + * `Record`. Undefined for single-node runs and any run + * predating the multinode patch. + */ + workers?: WorkerPower[]; } /** @@ -185,6 +215,13 @@ export function mapBenchmarkRow( // Artifact names encode '/' as '#' to avoid path separators; restore the URI. const image = row.image ? String(row.image).replaceAll('#', '/') : null; + // Per-worker measured-power breakdown. The runner emits this as an array + // of objects sibling to the scalar metrics; we surface it on a dedicated + // BenchmarkParams.workers field so downstream consumers can treat it as + // structured data without polluting the flat metrics record. Defensive + // narrowing — anything other than a non-empty array of objects is dropped. + const workers = extractWorkers(row.workers); + return { config: { hardware: gpuKey, @@ -210,5 +247,50 @@ export function mapBenchmarkRow( conc, image, metrics, + workers, }; } + +/** + * Narrow a raw `workers` value from the artifact JSON to `WorkerPower[]` or + * undefined. Each entry must have a string `role`, a numeric `worker_idx`, + * `num_gpus`, and `avg_power_w` to be kept; anything else is dropped. Optional + * telemetry scalars (`avg_temp_c`, `peak_temp_c`, `avg_util_pct`, + * `avg_mem_used_mb`) and the `hosts[]` list are preserved when present and + * well-typed, ignored otherwise. Returns undefined for any non-array input or + * an empty array so the eventual JSONB column stores null rather than `[]`. + */ +export function extractWorkers(raw: unknown): WorkerPower[] | undefined { + if (!Array.isArray(raw) || raw.length === 0) return undefined; + const out: WorkerPower[] = []; + for (const entry of raw) { + if (!entry || typeof entry !== 'object') continue; + const e = entry as Record; + const role = typeof e.role === 'string' ? e.role : null; + const worker_idx = parseInt2(e.worker_idx); + const num_gpus = parseInt2(e.num_gpus); + const avg_power_w = parseNum(e.avg_power_w); + if ( + role === null || + worker_idx === undefined || + num_gpus === undefined || + avg_power_w === undefined + ) + continue; + + const w: WorkerPower = { role, worker_idx, num_gpus, avg_power_w }; + if (Array.isArray(e.hosts) && e.hosts.every((h) => typeof h === 'string')) { + w.hosts = e.hosts as string[]; + } + const avg_temp_c = parseNum(e.avg_temp_c); + if (avg_temp_c !== undefined) w.avg_temp_c = avg_temp_c; + const peak_temp_c = parseNum(e.peak_temp_c); + if (peak_temp_c !== undefined) w.peak_temp_c = peak_temp_c; + const avg_util_pct = parseNum(e.avg_util_pct); + if (avg_util_pct !== undefined) w.avg_util_pct = avg_util_pct; + const avg_mem_used_mb = parseNum(e.avg_mem_used_mb); + if (avg_mem_used_mb !== undefined) w.avg_mem_used_mb = avg_mem_used_mb; + out.push(w); + } + return out.length > 0 ? out : undefined; +} diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index 857546a5..6c7c78b8 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -12,7 +12,7 @@ import { existsSync, readFileSync } from 'node:fs'; import { dirname, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; -import type { BenchmarkRow } from './queries/benchmarks.js'; +import type { BenchmarkRow, BenchmarkWorkerRow } from './queries/benchmarks.js'; import type { EvalRow } from './queries/evaluations.js'; import type { ReliabilityRow } from './queries/reliability.js'; import type { @@ -73,6 +73,8 @@ interface RawBenchmarkResult { conc: number; image: string | null; metrics: Record; + /** Added in migration 006; older dumps omit this field — surfaced as undefined. */ + workers?: BenchmarkWorkerRow[] | null; error: string | null; server_log_id: number | null; } @@ -300,6 +302,10 @@ function toBenchmarkRow( conc: br.conc, image: br.image, metrics: metrics ?? br.metrics, + // workers: optional sibling JSONB column. Older dumps (pre-migration 006) + // simply lack the field — defensively narrow to an array or undefined so + // downstream consumers can rely on the property being well-typed. + workers: Array.isArray(br.workers) ? br.workers : undefined, date: toDateString(br.date), run_url: buildRunUrl(wr), }; diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 1c30b1fd..a2167e95 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -1,4 +1,14 @@ import type { DbClient } from '../connection.js'; +import type { WorkerPower } from '../etl/benchmark-mapper.js'; + +/** + * One entry in `BenchmarkRow.workers` — mirrors the runner's aggregate_power.py + * per-worker payload. Structurally identical to the ingest-side {@link WorkerPower}, + * so it is aliased to that single definition rather than redeclared, keeping the + * shape from drifting within this package. The read side keeps the + * `BenchmarkWorkerRow` name it's referenced by (json-provider, BenchmarkRow.workers). + */ +export type BenchmarkWorkerRow = WorkerPower; export interface BenchmarkRow { hardware: string; @@ -23,6 +33,13 @@ export interface BenchmarkRow { conc: number; image: string | null; metrics: Record; + /** + * Per-worker measured-power breakdown emitted on multinode / disagg runs. + * Stored in the dedicated `workers` JSONB column on `benchmark_results` + * (added in migration 006). Null for single-node runs and any run predating + * aggregate_power.py's multinode patch — surfaced as undefined here. + */ + workers?: BenchmarkWorkerRow[]; date: string; run_url: string | null; } @@ -73,6 +90,7 @@ export async function getLatestBenchmarks( br.conc, br.image, br.metrics, + br.workers, br.date::text, CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url FROM benchmark_results br @@ -111,6 +129,7 @@ export async function getLatestBenchmarks( lb.conc, lb.image, lb.metrics, + lb.workers, lb.date::text, CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url FROM latest_benchmarks lb @@ -157,6 +176,7 @@ export async function getAllBenchmarksForHistory( br.osl, br.conc, br.metrics - '{std_ttft,std_tpot,std_e2el,std_intvty,std_itl,mean_ttft,mean_tpot,mean_e2el,mean_intvty,mean_itl}'::text[] as metrics, + br.workers, br.date::text, CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url FROM configs c