Skip to content

Commit 4a0efb8

Browse files
jahooma and claude committed
Detect cold Fireworks deployments; tighten TTFT/queue thresholds
A deployment reporting replicas=0, or exposing no replicas metric at all (it has been scaled to zero or dropped from the scrape), is now marked unhealthy unconditionally, so admission fails closed instead of funneling users to a backend that cannot serve traffic.

Also lower the generationQueueMs degraded threshold from 5000 to 400 and the ttftMs degraded threshold from 8000 to 2000, and comment out the kimi deployment since only glm-5.1 is in production.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8ca704a commit 4a0efb8

File tree

6 files changed

+71
-4
lines changed

6 files changed

+71
-4
lines changed

scripts/check-fireworks-health.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ async function main() {
110110
console.log(`── ${color}${health.status.toUpperCase().padEnd(9)}${RESET} ${model}`)
111111
console.log(` deployment: ${deployment}`)
112112
console.log(` base model: ${health.baseModel ?? 'n/a'}`)
113+
console.log(` replicas: ${health.metrics.replicas ?? 'n/a'}`)
113114
console.log(` request rate: ${health.metrics.requestRate.toFixed(3)} req/s`)
114115
console.log(` error rate: ${health.metrics.errorRate.toFixed(3)} err/s (${formatPct(health.metrics.errorFraction)})`)
115116
console.log(` concurrent requests: ${health.metrics.concurrentRequests.toFixed(2)}`)

web/src/llm-api/fireworks-config.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,6 @@ export const FIREWORKS_ACCOUNT_ID = 'james-65d217'
1010

1111
export const FIREWORKS_DEPLOYMENT_MAP: Record<string, string> = {
1212
// 'minimax/minimax-m2.5': 'accounts/james-65d217/deployments/lnfid5h9',
13-
'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2',
13+
// 'moonshotai/kimi-k2.5': 'accounts/james-65d217/deployments/mx8l5rq2',
1414
'z-ai/glm-5.1': 'accounts/james-65d217/deployments/mjb4i7ea',
1515
}

web/src/server/fireworks-monitor/__tests__/compute-health.test.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,18 @@ function fixture(params: {
1818
kvSlots?: number
1919
queueBuckets?: Array<{ le: string; count: number }>
2020
ttftBuckets?: Array<{ le: string; count: number }>
21+
/** deployment_replicas gauge. Defaults to 1 so existing tests stay healthy.
22+
* Set to 0 or null to simulate a cold/deleted deployment. */
23+
replicas?: number | null
2124
}): string {
2225
const lines: string[] = []
2326
const labels = `base_model="m",deployment="${DEPLOYMENT}",deployment_account="test-acc",deployment_id="d1"`
27+
const replicas = params.replicas === undefined ? 1 : params.replicas
28+
if (replicas !== null) {
29+
lines.push(
30+
`deployment_replicas{deployment_account="test-acc",deployment_id="d1"} ${replicas}`,
31+
)
32+
}
2433
if (params.requestRate !== undefined) {
2534
lines.push(`request_counter_total:sum_by_deployment{${labels}} ${params.requestRate}`)
2635
}
@@ -182,9 +191,38 @@ describe('computeDeploymentHealth', () => {
182191
expect(health.reasons.some((r) => r.includes('error rate'))).toBe(true)
183192
})
184193

194+
test('flags deployment with zero replicas as unhealthy', () => {
195+
const metrics = parsePrometheusText(
196+
fixture({ requestRate: 0, errorRate: 0, kvBlocks: 0, replicas: 0 }),
197+
)
198+
const health = computeDeploymentHealth({
199+
deployment: DEPLOYMENT,
200+
metrics,
201+
thresholds: DEFAULT_HEALTH_THRESHOLDS,
202+
})
203+
expect(health.status).toBe('unhealthy')
204+
expect(health.metrics.replicas).toBe(0)
205+
expect(health.reasons.some((r) => r.includes('replicas'))).toBe(true)
206+
})
207+
208+
test('flags deployment with no replicas metric as unhealthy (cold / deleted)', () => {
209+
const metrics = parsePrometheusText(
210+
fixture({ requestRate: 0, errorRate: 0, kvBlocks: 0, replicas: null }),
211+
)
212+
const health = computeDeploymentHealth({
213+
deployment: DEPLOYMENT,
214+
metrics,
215+
thresholds: DEFAULT_HEALTH_THRESHOLDS,
216+
})
217+
expect(health.status).toBe('unhealthy')
218+
expect(health.metrics.replicas).toBeNull()
219+
expect(health.reasons.some((r) => r.includes('cold or deleted'))).toBe(true)
220+
})
221+
185222
test('sums error counters across multiple HTTP codes', () => {
186223
const labels = `base_model="m",deployment="${DEPLOYMENT}",deployment_id="d1"`
187224
const text = [
225+
`deployment_replicas{deployment_account="test-acc",deployment_id="d1"} 1`,
188226
`request_counter_total:sum_by_deployment{${labels}} 100`,
189227
`requests_error_total:sum_by_deployment{${labels},http_code="500"} 3`,
190228
`requests_error_total:sum_by_deployment{${labels},http_code="429"} 5`,
@@ -231,9 +269,11 @@ describe('computeSnapshot', () => {
231269
test('overall status is the worst across deployments', () => {
232270
const dep2 = 'accounts/test-acc/deployments/d2'
233271
const text = [
272+
`deployment_replicas{deployment_id="d1"} 1`,
234273
`request_counter_total:sum_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1"} 100`,
235274
`requests_error_total:sum_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1",http_code="500"} 0`,
236275
`generator_kv_blocks_fraction:avg_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1"} 0.1`,
276+
`deployment_replicas{deployment_id="d2"} 1`,
237277
`request_counter_total:sum_by_deployment{deployment="${dep2}",deployment_id="d2"} 100`,
238278
`requests_error_total:sum_by_deployment{deployment="${dep2}",deployment_id="d2",http_code="500"} 30`,
239279
`generator_kv_blocks_fraction:avg_by_deployment{deployment="${dep2}",deployment_id="d2"} 0.1`,

web/src/server/fireworks-monitor/__tests__/monitor.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ afterEach(() => {
1717
const DEPLOYMENT = 'accounts/test-acc/deployments/d1'
1818

1919
const HEALTHY_BODY = [
20+
`deployment_replicas{deployment_id="d1"} 1`,
2021
`request_counter_total:sum_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1"} 10`,
2122
`requests_error_total:sum_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1",http_code="500"} 0`,
2223
`generator_kv_blocks_fraction:avg_by_deployment{deployment="${DEPLOYMENT}",deployment_id="d1"} 0.1`,

web/src/server/fireworks-monitor/compute-health.ts

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@ export const DEFAULT_HEALTH_THRESHOLDS: HealthThresholds = {
4949
errorFractionUnhealthy: 0.1,
5050
kvBlocksFractionDegraded: 0.95,
5151
kvBlocksFractionUnhealthy: 0.99,
52-
generationQueueMsDegraded: 5_000,
52+
generationQueueMsDegraded: 400,
5353
generationQueueMsUnhealthy: 15_000,
54-
ttftMsDegraded: 8_000,
54+
ttftMsDegraded: 2_000,
5555
ttftMsUnhealthy: 30_000,
5656
}
5757

@@ -69,6 +69,15 @@ export function computeDeploymentHealth(params: {
6969
}): DeploymentHealth {
7070
const { deployment, metrics, thresholds } = params
7171
const filter = { deployment }
72+
const deploymentId = parseDeploymentId(deployment)
73+
74+
// `deployment_replicas` is keyed by deployment_id (not the full deployment
75+
// path). Zero or missing replicas means the deployment is cold / scaled to
76+
// zero / deleted — admission must fail closed in that case.
77+
const replicasSamples = findSamples(metrics, 'deployment_replicas', {
78+
deployment_id: deploymentId,
79+
})
80+
const replicas = replicasSamples.length > 0 ? sumSamples(replicasSamples) : null
7281

7382
const requestRateSamples = findSamples(
7483
metrics,
@@ -121,7 +130,6 @@ export function computeDeploymentHealth(params: {
121130
...errorRateSamples,
122131
].find((s) => s.labels.base_model)
123132
const baseModel = baseModelSample?.labels.base_model ?? null
124-
const deploymentId = baseModelSample?.labels.deployment_id ?? parseDeploymentId(deployment)
125133

126134
const reasons: string[] = []
127135
let status: DeploymentHealthStatus = 'healthy'
@@ -130,6 +138,18 @@ export function computeDeploymentHealth(params: {
130138
if (STATUS_RANK[next] > STATUS_RANK[status]) status = next
131139
}
132140

141+
// A deployment with no running replicas cannot serve traffic. Treat as
142+
// unhealthy unconditionally so admission stops funneling users to a cold
143+
// backend. Missing gauge (`replicas === null`) is the strongest signal that
144+
// Fireworks has dropped the deployment from its scrape entirely.
145+
if (replicas === null) {
146+
reasons.push('no replicas metric — deployment cold or deleted')
147+
upgrade('unhealthy')
148+
} else if (replicas <= 0) {
149+
reasons.push(`replicas=${replicas}`)
150+
upgrade('unhealthy')
151+
}
152+
133153
if (requestRate >= thresholds.minRequestRateForErrorCheck) {
134154
if (errorFraction >= thresholds.errorFractionUnhealthy) {
135155
reasons.push(`error rate ${(errorFraction * 100).toFixed(1)}% ≥ ${(thresholds.errorFractionUnhealthy * 100).toFixed(1)}%`)
@@ -175,6 +195,7 @@ export function computeDeploymentHealth(params: {
175195
status,
176196
reasons,
177197
metrics: {
198+
replicas,
178199
requestRate,
179200
errorRate,
180201
errorFraction,
@@ -223,6 +244,7 @@ export function computeSnapshot(params: {
223244
status: 'unknown',
224245
reasons: ['no scrape yet'],
225246
metrics: {
247+
replicas: null,
226248
requestRate: 0,
227249
errorRate: 0,
228250
errorFraction: 0,

web/src/server/fireworks-monitor/types.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ export interface DeploymentHealth {
1818
status: DeploymentHealthStatus
1919
reasons: string[]
2020
metrics: {
21+
/** null when Fireworks doesn't emit a deployment_replicas gauge for the
22+
* deployment (cold / deleted / not-yet-scraped). 0 means scaled-to-zero. */
23+
replicas: number | null
2124
requestRate: number
2225
errorRate: number
2326
errorFraction: number

0 commit comments

Comments
 (0)