Skip to content

Commit 7b8712c

Browse files
committed
Health checks for deployment to gate admission
1 parent 938272e commit 7b8712c

File tree

6 files changed

+762
-67
lines changed

6 files changed

+762
-67
lines changed
Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
#!/usr/bin/env bun
2+
3+
/**
4+
* Fetch and render Fireworks deployment health + runtime stats.
5+
*
6+
* Data sources:
7+
* - GET /v1/accounts/{account}/deployments (list / per-deployment state)
8+
* - GET /v1/accounts/{account}/metrics (Prometheus text, all deployments)
9+
*
10+
* Usage:
11+
* bun scripts/fireworks-deployment-stats.ts # all deployments in the account
12+
* bun scripts/fireworks-deployment-stats.ts <deployment_id> # filter to one deployment
13+
*
14+
* Env:
15+
* FIREWORKS_API_KEY (required) — auto-loaded from .env.local via bun
16+
* FIREWORKS_ACCOUNT_ID (optional) — defaults to the account in fireworks-config.ts
17+
*/
18+
19+
import { FIREWORKS_ACCOUNT_ID } from '../web/src/llm-api/fireworks-config'
20+
21+
/** Base URL that every Fireworks REST request in this script is built from. */
const API_BASE = 'https://api.fireworks.ai/v1'

/** Replica counters nested under `replicaStats` on a deployment. */
type DeploymentReplicaStats = {
  readyReplicaCount: number
  initializingReplicaCount: number
  pendingSchedulingReplicaCount: number
  downloadingModelReplicaCount: number
}

/** Autoscaling settings nested under `autoscalingPolicy` on a deployment. */
type DeploymentAutoscalingPolicy = {
  // The renderer reads the `default` entry of this map.
  loadTargets: Record<string, number>
  // The three windows are strings; the renderer passes them through parseDuration.
  scaleUpWindow: string
  scaleDownWindow: string
  scaleToZeroWindow: string
}

/**
 * Shape this script assumes for each entry of the `deployments` array returned
 * by the deployments list endpoint. The response is cast to this type, not
 * validated at runtime.
 */
type Deployment = {
  // Slash-separated resource name; its last segment is used as the deployment id.
  name: string
  baseModel: string
  state: string
  status: { code: string; message: string }
  replicaCount: number
  desiredReplicaCount: number
  minReplicaCount: number
  maxReplicaCount: number
  replicaStats: DeploymentReplicaStats
  createTime: string
  updateTime: string
  // Slash-separated resource name; the renderer shows its third-from-last segment.
  deploymentShape: string
  autoscalingPolicy: DeploymentAutoscalingPolicy
}

/** One parsed line of the Prometheus text payload: metric name, label set, numeric value. */
type PromSample = {
  name: string
  labels: Record<string, string>
  value: number
}

/**
 * Histogram series shown in the percentile table.
 *
 * `key` is the metric prefix (the bucket series is looked up as
 * `${key}_bucket:sum_by_deployment`); `label` is the row title that is printed.
 */
const HISTOGRAM_METRICS = [
  {
    key: 'latency_to_first_token_ms',
    label: 'TTFT (ms)',
  },
  {
    key: 'latency_prefill_ms',
    label: 'prefill (ms)',
  },
  {
    key: 'latency_prefill_queue_ms',
    label: 'prefill-queue (ms)',
  },
  {
    key: 'latency_generation_queue_ms',
    label: 'gen-queue (ms)',
  },
  {
    key: 'latency_generation_per_token_ms',
    label: 'inter-token (ms)',
  },
  {
    key: 'latency_overall_ms',
    label: 'overall (ms)',
  },
  {
    key: 'tokens_prompt_per_request',
    label: 'prompt toks/req',
  },
  {
    key: 'tokens_generated_per_request',
    label: 'gen toks/req',
  },
] as const
61+
62+
/**
 * List every deployment in a Fireworks account.
 *
 * The list endpoint is paginated, so a single request can return only part of
 * the account. This keeps requesting pages, passing the previous response's
 * `nextPageToken` back as the `pageToken` query parameter, until the server
 * stops returning a token.
 *
 * @param apiKey - Fireworks API key, sent as a Bearer token.
 * @param accountId - Account whose deployments are listed.
 * @returns Deployments from all pages, in the order the server returned them;
 *   empty when the account has none.
 * @throws Error when any page request comes back non-OK; the message carries
 *   the HTTP status and response body.
 */
async function fetchDeployments(apiKey: string, accountId: string): Promise<Deployment[]> {
  const collected: Deployment[] = []
  let pageToken: string | undefined

  do {
    const url = new URL(`${API_BASE}/accounts/${accountId}/deployments`)
    if (pageToken) url.searchParams.set('pageToken', pageToken)

    const res = await fetch(url, {
      headers: { Authorization: `Bearer ${apiKey}` },
    })
    if (!res.ok) throw new Error(`Deployments list ${res.status}: ${await res.text()}`)

    const data = (await res.json()) as { deployments?: Deployment[]; nextPageToken?: string }
    collected.push(...(data.deployments ?? []))

    // An empty token means "last page". A token identical to the one just sent
    // would loop forever, so treat that as the end as well.
    const next = data.nextPageToken || undefined
    pageToken = next === pageToken ? undefined : next
  } while (pageToken)

  return collected
}
70+
71+
/**
 * Download the account's metrics endpoint and parse the body into samples.
 *
 * @param apiKey - Fireworks API key, sent as a Bearer token.
 * @param accountId - Account whose metrics are requested.
 * @returns The samples produced by `parsePrometheus` from the response text.
 * @throws Error when the response is not OK; the message carries the HTTP
 *   status and response body.
 */
async function fetchPrometheusMetrics(apiKey: string, accountId: string): Promise<PromSample[]> {
  const endpoint = `${API_BASE}/accounts/${accountId}/metrics`
  const authHeaders = { Authorization: `Bearer ${apiKey}` }

  const response = await fetch(endpoint, { headers: authHeaders })
  if (response.ok) {
    const body = await response.text()
    return parsePrometheus(body)
  }
  throw new Error(`Metrics ${response.status}: ${await response.text()}`)
}
79+
80+
/**
 * Find the `}` that closes a label set, ignoring braces inside quoted values.
 *
 * @param line - One exposition line.
 * @param openIdx - Index of the opening `{` in `line`.
 * @returns Index of the matching `}`, or -1 when the label set is unterminated.
 */
function findLabelBlockEnd(line: string, openIdx: number): number {
  let inQuotes = false
  for (let i = openIdx + 1; i < line.length; i++) {
    const ch = line[i]
    if (inQuotes) {
      if (ch === '\\') i++ // skip the escaped character (e.g. \" or \\)
      else if (ch === '"') inQuotes = false
    } else if (ch === '"') {
      inQuotes = true
    } else if (ch === '}') {
      return i
    }
  }
  return -1
}

/**
 * Parse Prometheus text-exposition output into flat samples.
 *
 * Each non-comment line is read as `name[{labels}] value [timestamp]`; only the
 * name, labels and value are kept. Lines are skipped (never thrown on) when
 * they are blank, start with `#`, have no value token, have an unterminated
 * label set, or carry a value that is not a finite number (`NaN`, `+Inf`, ...).
 *
 * Label values are unescaped (`\\`, `\"`, `\n`), and a `}` inside a quoted label
 * value does not end the label set.
 *
 * @param text - Raw response body; `\n` or `\r\n` line endings are accepted.
 * @returns One sample per accepted line, in input order.
 */
function parsePrometheus(text: string): PromSample[] {
  // Built once; matchAll works on a clone, so the shared /g state is never reused.
  const labelPattern = /(\w+)="((?:[^"\\]|\\.)*)"/g
  const samples: PromSample[] = []

  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim()
    if (!line || line.startsWith('#')) continue

    const braceStart = line.indexOf('{')
    let name: string
    let labelStr = ''
    let rest: string
    if (braceStart === -1) {
      const firstSpace = line.search(/\s/)
      if (firstSpace === -1) continue // metric name with no value at all
      name = line.slice(0, firstSpace)
      rest = line.slice(firstSpace).trim()
    } else {
      const braceEnd = findLabelBlockEnd(line, braceStart)
      if (braceEnd === -1) continue // unterminated label set
      name = line.slice(0, braceStart).trim()
      labelStr = line.slice(braceStart + 1, braceEnd)
      rest = line.slice(braceEnd + 1).trim()
    }

    // Number('') is 0, so an absent value must be rejected before converting.
    const valueToken = rest.split(/\s+/)[0]
    if (!valueToken) continue
    const value = Number(valueToken)
    if (!Number.isFinite(value)) continue

    const labels: Record<string, string> = {}
    for (const m of labelStr.matchAll(labelPattern)) {
      labels[m[1]] = m[2].replace(/\\(.)/g, (_whole: string, ch: string) => (ch === 'n' ? '\n' : ch))
    }
    samples.push({ name, labels, value })
  }
  return samples
}
111+
112+
/**
 * Look up a single-valued metric for one deployment.
 *
 * @param samples - Parsed samples to search.
 * @param name - Exact metric name to match.
 * @param deploymentId - Value the sample's `deployment_id` label must equal.
 * @returns The value of the first matching sample, or `undefined` when no
 *   sample matches.
 */
function scalarFor(samples: PromSample[], name: string, deploymentId: string): number | undefined {
  for (const sample of samples) {
    const sameMetric = sample.name === name
    if (sameMetric && sample.labels.deployment_id === deploymentId) {
      return sample.value
    }
  }
  return undefined
}
115+
116+
/**
 * Estimate percentiles from a cumulative histogram's bucket series.
 *
 * Uses the samples named `${metricKey}_bucket:sum_by_deployment` whose
 * `deployment_id` label equals `deploymentId`. Each sample's `le` label is the
 * bucket's upper bound (`+Inf` for the overflow bucket) and its value is the
 * cumulative count up to that bound. Samples whose `le` is missing, empty, or
 * not a number are ignored, since they cannot be placed in bucket order.
 *
 * For each percentile the first bucket whose cumulative count reaches the
 * target is located, and the result is linearly interpolated between the
 * previous bound (0 before the first bucket) and that bucket's bound. When the
 * target falls in the `+Inf` bucket, the highest finite bound seen is returned.
 *
 * @param samples - Parsed samples to search.
 * @param metricKey - Metric prefix, without the `_bucket:sum_by_deployment` suffix.
 * @param deploymentId - Deployment to select.
 * @param percentiles - Percentiles to compute, on a 0-100 scale.
 * @returns `null` when no usable bucket exists; otherwise `total` (cumulative
 *   count of the highest bucket) and one value per requested percentile (all 0
 *   when `total` is 0).
 */
function bucketPercentiles(
  samples: PromSample[],
  metricKey: string,
  deploymentId: string,
  percentiles: number[] = [50, 90, 95, 99],
): { total: number; values: Record<number, number> } | null {
  const seriesName = `${metricKey}_bucket:sum_by_deployment`

  const buckets: { le: number; cum: number }[] = []
  for (const s of samples) {
    if (s.name !== seriesName || s.labels.deployment_id !== deploymentId) continue
    const rawLe = s.labels.le
    // Number('') is 0 and Number(undefined) is NaN; neither is a real bound.
    if (rawLe === undefined || rawLe.trim() === '') continue
    const le = rawLe === '+Inf' ? Number.POSITIVE_INFINITY : Number(rawLe)
    if (Number.isNaN(le)) continue
    buckets.push({ le, cum: s.value })
  }
  // Compare explicitly: subtracting two +Inf bounds would give NaN.
  buckets.sort((a, b) => (a.le < b.le ? -1 : a.le > b.le ? 1 : 0))

  if (buckets.length === 0) return null
  const total = buckets[buckets.length - 1].cum
  if (total === 0) return { total, values: Object.fromEntries(percentiles.map((p) => [p, 0])) }

  const values: Record<number, number> = {}
  for (const p of percentiles) {
    const target = total * (p / 100)
    let prevLe = 0
    let prevCum = 0
    let picked = Number.POSITIVE_INFINITY
    for (const { le, cum } of buckets) {
      if (cum >= target) {
        if (!Number.isFinite(le)) {
          // Overflow bucket has no upper bound to interpolate towards.
          picked = prevLe
        } else if (cum === prevCum) {
          // Empty span (only reachable when target is 0): avoid dividing by zero.
          picked = le
        } else {
          const frac = (target - prevCum) / (cum - prevCum)
          picked = prevLe + frac * (le - prevLe)
        }
        break
      }
      prevLe = le
      prevCum = cum
    }
    values[p] = picked
  }
  return { total, values }
}
161+
162+
/**
 * Render a number for the report.
 *
 * @param n - Value to render; may be absent.
 * @param digits - Decimal places used when the magnitude is below 1000.
 * @returns An em dash when `n` is undefined or not finite; otherwise the value
 *   in fixed-point notation, with no decimals once its magnitude reaches 1000.
 */
function fmt(n: number | undefined, digits = 0): string {
  if (typeof n !== 'number' || !Number.isFinite(n)) {
    return '—'
  }
  const decimals = Math.abs(n) < 1000 ? digits : 0
  return n.toFixed(decimals)
}
167+
168+
/**
 * Render a 0-1 fraction as a percentage with one decimal place.
 *
 * @param n - Fraction to render; may be absent.
 * @returns An em dash when `n` is undefined or not finite (the same placeholder
 *   rule as `fmt`, so `NaN%` / `Infinity%` never appear in the report);
 *   otherwise e.g. `12.3%`.
 */
function fmtPct(n: number | undefined): string {
  if (n === undefined || !Number.isFinite(n)) return '—'
  return `${(n * 100).toFixed(1)}%`
}
171+
172+
/**
 * Shorten a seconds-suffixed duration string (e.g. `"300s"`) for display.
 *
 * - Input that is not `<number>s`, or whose number does not parse, is returned
 *   unchanged.
 * - Under 60 seconds: `"<secs>s"`.
 * - 60 seconds or more: whole minutes as `"<m>m"`, otherwise `"<m>m<s>s"`, so a
 *   90-second window reads `1m30s` instead of being rounded to `2m`.
 *
 * @param d - Duration string to shorten.
 * @returns The shortened form, or `d` itself when it is not recognised.
 */
function parseDuration(d: string): string {
  const match = /^([\d.]+)s$/.exec(d)
  if (!match) return d
  const secs = Number(match[1])
  // The pattern admits strings like "1.2.3"; do not print "NaNs" for them.
  if (!Number.isFinite(secs)) return d
  if (secs < 60) return `${secs}s`

  // Work in whole milliseconds so the remainder carries no floating-point noise.
  const totalMs = Math.round(secs * 1000)
  const minutes = Math.floor(totalMs / 60000)
  const remSecs = (totalMs % 60000) / 1000
  return remSecs === 0 ? `${minutes}m` : `${minutes}m${remSecs}s`
}
179+
180+
/**
 * Print a report for one deployment to stdout.
 *
 * Sections, in order: identity/state/replica/autoscale summary taken from the
 * deployment object, then capacity gauges, throughput, error rates and a
 * histogram percentile table taken from `samples`. Metrics are matched on the
 * last `/`-separated segment of `d.name` as the `deployment_id` label. Metrics
 * with no sample print as an em dash, except the two error rates, which print
 * as 0.
 *
 * @param d - Deployment to describe.
 * @param samples - Parsed metrics samples; may cover other deployments too.
 */
function renderDeployment(d: Deployment, samples: PromSample[]): void {
  // split() always yields at least one element; the fallback only satisfies the type checker.
  const deploymentId = d.name.split('/').pop() ?? d.name
  // Third-from-last path segment of the shape resource name, else the raw string.
  const shape = d.deploymentShape.split('/').slice(-3, -2)[0] ?? d.deploymentShape

  const stateIcon = d.state === 'READY' ? '✅' : d.state === 'UPDATING' ? '🔄' : '⚠️'

  console.log('━'.repeat(80))
  console.log(`${stateIcon} ${d.name}`)
  console.log(` model=${d.baseModel} shape=${shape}`)
  console.log(
    ` state=${d.state} (${d.status.code}) replicas ready=${d.replicaStats.readyReplicaCount}/${d.replicaCount} ` +
      `min=${d.minReplicaCount} max=${d.maxReplicaCount}`,
  )
  const p = d.autoscalingPolicy
  console.log(
    ` autoscale target=${p.loadTargets.default} up=${parseDuration(p.scaleUpWindow)} ` +
      `down=${parseDuration(p.scaleDownWindow)} to-zero=${parseDuration(p.scaleToZeroWindow)}`,
  )
  console.log(` updated=${d.updateTime}`)

  const kvBlocks = scalarFor(samples, 'generator_kv_blocks_fraction:avg_by_deployment', deploymentId)
  const kvSlots = scalarFor(samples, 'generator_kv_slots_fraction:avg_by_deployment', deploymentId)
  const active = scalarFor(samples, 'generator_num_active_fraction:avg_by_deployment', deploymentId)
  const fwdTime = scalarFor(samples, 'generator_model_forward_time:avg_by_deployment', deploymentId)

  const reqRate = scalarFor(samples, 'request_counter_total:sum_by_deployment', deploymentId)
  const promptTokRate = scalarFor(samples, 'tokens_prompt_total:sum_by_deployment', deploymentId)
  const cachedPromptRate = scalarFor(samples, 'tokens_cached_prompt_total:sum_by_deployment', deploymentId)
  const genTokGauge = scalarFor(samples, 'tokens_generated_gauge:sum_by_deployment', deploymentId)

  // Error samples carry an extra `code` label, so scalarFor cannot select them.
  const errorRateFor = (code: string): number | undefined =>
    samples.find(
      (s) =>
        s.name === 'requests_error_total:sum_by_deployment' &&
        s.labels.deployment_id === deploymentId &&
        s.labels.code === code,
    )?.value
  const err400 = errorRateFor('400')
  const err500 = errorRateFor('500')

  const cacheHitRate =
    promptTokRate && promptTokRate > 0 && cachedPromptRate !== undefined
      ? cachedPromptRate / promptTokRate
      : undefined
  const errRate400 = reqRate && reqRate > 0 && err400 !== undefined ? err400 / reqRate : undefined

  // Multiplied by 1000 to match the `ms` suffix below. Kept undefined when the
  // metric is absent so the report shows a placeholder, not a measured "0.0ms".
  const fwdTimeMs = fwdTime === undefined ? undefined : fwdTime * 1000

  console.log('\n GPU / capacity')
  console.log(
    ` kv_blocks=${fmtPct(kvBlocks)} kv_slots=${fmtPct(kvSlots)} ` +
      `active_generators=${fmt(active, 2)} fwd_time=${fmt(fwdTimeMs, 1)}ms`,
  )

  console.log('\n Throughput (per-sec rates)')
  console.log(
    ` requests=${fmt(reqRate, 2)}/s prompt_tokens=${fmt(promptTokRate)}/s ` +
      `cached_prompt=${fmt(cachedPromptRate)}/s cache_hit=${fmtPct(cacheHitRate)} ` +
      `generated_gauge=${fmt(genTokGauge, 1)}`,
  )

  console.log('\n Errors (per-sec)')
  console.log(
    // A missing error series is shown as a zero rate.
    ` 400=${fmt(err400 ?? 0, 3)}/s (${fmtPct(errRate400)}) 500=${fmt(err500 ?? 0, 3)}/s`,
  )

  console.log('\n Latency & size percentiles')
  console.log(
    ` ${'metric'.padEnd(22)} ${'events'.padStart(9)} ${'p50'.padStart(9)} ${'p90'.padStart(9)} ${'p95'.padStart(9)} ${'p99'.padStart(9)}`,
  )
  for (const h of HISTOGRAM_METRICS) {
    const pct = bucketPercentiles(samples, h.key, deploymentId)
    if (!pct) {
      // No bucket series for this deployment: print the row with a placeholder.
      console.log(` ${h.label.padEnd(22)} ${'—'.padStart(9)}`)
      continue
    }
    console.log(
      ` ${h.label.padEnd(22)} ${fmt(pct.total, 2).padStart(9)} ` +
        `${fmt(pct.values[50]).padStart(9)} ${fmt(pct.values[90]).padStart(9)} ` +
        `${fmt(pct.values[95]).padStart(9)} ${fmt(pct.values[99]).padStart(9)}`,
    )
  }
  console.log()
}
265+
266+
/**
 * Script entry point: fetch deployments and metrics, then print one report per
 * deployment.
 *
 * Reads `FIREWORKS_API_KEY` (required) and `FIREWORKS_ACCOUNT_ID` (optional,
 * falling back to the imported default) from the environment. The first CLI
 * argument, when given, keeps only the deployment whose full name equals it or
 * ends with `/<argument>`.
 *
 * Exits with status 1 when the API key is missing (or is the placeholder
 * `dummy_fireworks_key`) or when no deployment matches. Fetch failures reject
 * the returned promise.
 */
async function main(): Promise<void> {
  const apiKey = process.env.FIREWORKS_API_KEY
  if (!apiKey || apiKey === 'dummy_fireworks_key') {
    console.error('FIREWORKS_API_KEY not set (check .env.local)')
    process.exit(1)
  }
  // `||` on purpose: an env var that is set but empty should not replace the
  // configured default with an empty account id.
  const accountId = process.env.FIREWORKS_ACCOUNT_ID || FIREWORKS_ACCOUNT_ID
  const filter = process.argv[2]

  // The two requests are independent, so issue them concurrently.
  const [deployments, samples] = await Promise.all([
    fetchDeployments(apiKey, accountId),
    fetchPrometheusMetrics(apiKey, accountId),
  ])

  const filtered = filter
    ? deployments.filter((d) => d.name.endsWith(`/${filter}`) || d.name === filter)
    : deployments

  if (filtered.length === 0) {
    console.error(`No deployments matched${filter ? ` "${filter}"` : ''} in account ${accountId}`)
    process.exit(1)
  }

  // Separator keeps the account id from running into the count.
  console.log(`Fireworks account: ${accountId} — ${filtered.length} deployment(s)`)
  console.log(`Rates below are per-second (Prometheus recording rules; ~30s update cadence).`)
  console.log()

  for (const d of filtered) renderDeployment(d, samples)
}

// Any rejection (failed fetch, unexpected payload) is logged and reported
// through a non-zero exit status.
main().catch((err) => {
  console.error(err)
  process.exit(1)
})

web/src/server/free-session/__tests__/admission.test.ts

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { describe, expect, test } from 'bun:test'
33
import { runAdmissionTick } from '../admission'
44

55
import type { AdmissionDeps } from '../admission'
6+
import type { FireworksHealth } from '../fireworks-health'
67

78
const NOW = new Date('2026-04-17T12:00:00Z')
89

@@ -14,11 +15,12 @@ function makeAdmissionDeps(overrides: Partial<AdmissionDeps> = {}): AdmissionDep
1415
calls,
1516
sweepExpired: async () => 0,
1617
queueDepth: async () => 0,
17-
isFireworksAdmissible: async () => true,
18-
admitFromQueue: async ({ isFireworksAdmissible }) => {
18+
getFireworksHealth: async () => 'healthy',
19+
admitFromQueue: async ({ getFireworksHealth }) => {
1920
calls.admit += 1
20-
if (!(await isFireworksAdmissible())) {
21-
return { admitted: [], skipped: 'health' }
21+
const health = await getFireworksHealth()
22+
if (health !== 'healthy') {
23+
return { admitted: [], skipped: health }
2224
}
2325
return { admitted: [{ user_id: 'u0' }], skipped: null }
2426
},
@@ -38,13 +40,22 @@ describe('runAdmissionTick', () => {
3840
expect(result.skipped).toBeNull()
3941
})
4042

41-
test('skips admission when Fireworks not healthy', async () => {
43+
test('skips admission when Fireworks is degraded', async () => {
4244
const deps = makeAdmissionDeps({
43-
isFireworksAdmissible: async () => false,
45+
getFireworksHealth: async () => 'degraded' as FireworksHealth,
4446
})
4547
const result = await runAdmissionTick(deps)
4648
expect(result.admitted).toBe(0)
47-
expect(result.skipped).toBe('health')
49+
expect(result.skipped).toBe('degraded')
50+
})
51+
52+
test('skips admission when Fireworks is unhealthy', async () => {
53+
const deps = makeAdmissionDeps({
54+
getFireworksHealth: async () => 'unhealthy' as FireworksHealth,
55+
})
56+
const result = await runAdmissionTick(deps)
57+
expect(result.admitted).toBe(0)
58+
expect(result.skipped).toBe('unhealthy')
4859
})
4960

5061
test('sweeps expired sessions even when skipping admission', async () => {
@@ -54,7 +65,7 @@ describe('runAdmissionTick', () => {
5465
swept = 3
5566
return 3
5667
},
57-
isFireworksAdmissible: async () => false,
68+
getFireworksHealth: async () => 'unhealthy' as FireworksHealth,
5869
})
5970
const result = await runAdmissionTick(deps)
6071
expect(swept).toBe(3)

0 commit comments

Comments
 (0)