Skip to content

Commit c0864e8

Browse files
jahooma and claude
committed
Support multiple Fireworks deployments independently
Admit from the waiting room if any deployment is healthy (previously the result was the worst-of across all deployments). With one deployment per model — and per country in the future — a degraded deployment for one model shouldn't block users whose model routes elsewhere. Also make the DEPLOYMENT_SCALING_UP cooldown per-deployment, so one deployment's 503 no longer poisons routing for the others.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent b01d2e3 commit c0864e8

File tree

5 files changed

+98
-30
lines changed

5 files changed

+98
-30
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,4 @@ debug/
3030

3131
# Infisical config (user-specific)
3232
.infisical.json
33+
.gstack/

web/src/llm-api/__tests__/fireworks-deployment.test.ts

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,35 @@ describe('Fireworks deployment routing', () => {
4545
})
4646

4747
it('isDeploymentCoolingDown returns false initially', () => {
48-
expect(isDeploymentCoolingDown()).toBe(false)
48+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
4949
})
5050

5151
it('isDeploymentCoolingDown returns true after markDeploymentScalingUp', () => {
52-
markDeploymentScalingUp()
53-
expect(isDeploymentCoolingDown()).toBe(true)
52+
markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
53+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
5454
})
5555

5656
it('isDeploymentCoolingDown returns false after resetDeploymentCooldown', () => {
57-
markDeploymentScalingUp()
58-
expect(isDeploymentCoolingDown()).toBe(true)
57+
markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
58+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
5959
resetDeploymentCooldown()
60-
expect(isDeploymentCoolingDown()).toBe(false)
60+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
61+
})
62+
63+
it('cooldown on one deployment does not affect other deployments', () => {
64+
const otherDeployment = 'accounts/james-65d217/deployments/other123'
65+
markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
66+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
67+
expect(isDeploymentCoolingDown(otherDeployment)).toBe(false)
68+
})
69+
70+
it('resetDeploymentCooldown with an id only clears that deployment', () => {
71+
const otherDeployment = 'accounts/james-65d217/deployments/other123'
72+
markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
73+
markDeploymentScalingUp(otherDeployment)
74+
resetDeploymentCooldown(DEPLOYMENT_MODEL_ID)
75+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
76+
expect(isDeploymentCoolingDown(otherDeployment)).toBe(true)
6177
})
6278

6379
it('DEPLOYMENT_COOLDOWN_MS is 2 minutes', () => {
@@ -195,8 +211,8 @@ describe('Fireworks deployment routing', () => {
195211
expect(fetchCalls).toHaveLength(2)
196212
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
197213
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
198-
// Verify cooldown was activated
199-
expect(isDeploymentCoolingDown()).toBe(true)
214+
// Verify cooldown was activated for this specific deployment
215+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
200216
} finally {
201217
spy.restore()
202218
}
@@ -243,7 +259,7 @@ describe('Fireworks deployment routing', () => {
243259
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
244260
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
245261
// Non-scaling 503 should NOT activate the cooldown
246-
expect(isDeploymentCoolingDown()).toBe(false)
262+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
247263
} finally {
248264
spy.restore()
249265
}
@@ -283,15 +299,15 @@ describe('Fireworks deployment routing', () => {
283299
expect(fetchCalls).toHaveLength(2)
284300
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
285301
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
286-
expect(isDeploymentCoolingDown()).toBe(false)
302+
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
287303
} finally {
288304
spy.restore()
289305
}
290306
})
291307

292308
it('skips deployment during cooldown and goes straight to standard API', async () => {
293309
const spy = spyDeploymentHours(true)
294-
markDeploymentScalingUp()
310+
markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
295311

296312
const fetchCalls: string[] = []
297313
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {

web/src/llm-api/fireworks.ts

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,22 +44,32 @@ export function isDeploymentHours(_now: Date = new Date()): boolean {
4444
}
4545

4646
/**
47-
* In-memory cooldown to avoid repeatedly hitting a deployment that is scaling up.
48-
* After a DEPLOYMENT_SCALING_UP 503, we skip the deployment for this many ms.
47+
* In-memory per-deployment cooldown to avoid repeatedly hitting a deployment
48+
* that is scaling up. After a DEPLOYMENT_SCALING_UP 503, we skip that
49+
* deployment for this many ms. Keyed by full deployment path so one
50+
* deployment's cooldown doesn't affect routing to other deployments.
4951
*/
5052
export const DEPLOYMENT_COOLDOWN_MS = 2 * 60 * 1000
51-
let deploymentScalingUpUntil = 0
52-
53-
export function isDeploymentCoolingDown(): boolean {
54-
return Date.now() < deploymentScalingUpUntil
53+
const deploymentCooldowns = new Map<string, number>()
54+
55+
export function isDeploymentCoolingDown(deploymentId: string): boolean {
56+
const until = deploymentCooldowns.get(deploymentId)
57+
if (until === undefined) return false
58+
if (Date.now() < until) return true
59+
deploymentCooldowns.delete(deploymentId)
60+
return false
5561
}
5662

57-
export function markDeploymentScalingUp(): void {
58-
deploymentScalingUpUntil = Date.now() + DEPLOYMENT_COOLDOWN_MS
63+
export function markDeploymentScalingUp(deploymentId: string): void {
64+
deploymentCooldowns.set(deploymentId, Date.now() + DEPLOYMENT_COOLDOWN_MS)
5965
}
6066

61-
export function resetDeploymentCooldown(): void {
62-
deploymentScalingUpUntil = 0
67+
export function resetDeploymentCooldown(deploymentId?: string): void {
68+
if (deploymentId === undefined) {
69+
deploymentCooldowns.clear()
70+
return
71+
}
72+
deploymentCooldowns.delete(deploymentId)
6373
}
6474

6575
export function isFireworksModel(model: string): boolean {
@@ -725,9 +735,9 @@ export async function createFireworksRequestWithFallback(params: {
725735
const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel]
726736
const shouldTryDeployment =
727737
useCustomDeployment &&
728-
deploymentModelId &&
738+
!!deploymentModelId &&
729739
isDeploymentHours() &&
730-
!isDeploymentCoolingDown()
740+
!isDeploymentCoolingDown(deploymentModelId)
731741

732742
if (shouldTryDeployment) {
733743
logger.info(
@@ -749,7 +759,7 @@ export async function createFireworksRequestWithFallback(params: {
749759
'Fireworks custom deployment returned 5xx, falling back to standard API',
750760
)
751761
if (errorText.includes('DEPLOYMENT_SCALING_UP')) {
752-
markDeploymentScalingUp()
762+
markDeploymentScalingUp(deploymentModelId)
753763
}
754764
// Fall through to standard API request below
755765
} else {

web/src/server/free-session/__tests__/fireworks-health.test.ts

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,43 @@ describe('fireworks health classifier', () => {
108108
expect(classify([], [DEPLOY])).toBe('healthy')
109109
})
110110

111-
test('worst-of across multiple deployments — unhealthy wins over degraded', () => {
111+
test('healthy if any deployment is healthy (one deployment per model, users route per-model)', () => {
112112
const other = 'other123'
113113
const samples: PromSample[] = [
114+
// DEPLOY is healthy
115+
kvBlocks(0.5),
116+
...prefillQueueBuckets(150),
117+
// other is unhealthy
118+
{
119+
name: 'generator_kv_blocks_fraction:avg_by_deployment',
120+
labels: { deployment_id: other },
121+
value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005,
122+
},
123+
]
124+
expect(classify(samples, [DEPLOY, other])).toBe('healthy')
125+
})
126+
127+
test('degraded when all deployments are non-healthy and at least one is degraded', () => {
128+
const other = 'other123'
129+
const samples: PromSample[] = [
130+
// DEPLOY is degraded (prefill queue over threshold)
114131
kvBlocks(0.5),
115132
...prefillQueueBuckets(PREFILL_QUEUE_P90_DEGRADED_MS + 500),
133+
// other is unhealthy (KV backstop)
134+
{
135+
name: 'generator_kv_blocks_fraction:avg_by_deployment',
136+
labels: { deployment_id: other },
137+
value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005,
138+
},
139+
]
140+
expect(classify(samples, [DEPLOY, other])).toBe('degraded')
141+
})
142+
143+
test('unhealthy only when every deployment is unhealthy', () => {
144+
const other = 'other123'
145+
const samples: PromSample[] = [
146+
kvBlocks(KV_BLOCKS_UNHEALTHY_FRACTION + 0.005),
147+
...prefillQueueBuckets(300),
116148
{
117149
name: 'generator_kv_blocks_fraction:avg_by_deployment',
118150
labels: { deployment_id: other },

web/src/server/free-session/fireworks-health.ts

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,18 +110,27 @@ async function probe(): Promise<FireworksHealth> {
110110
return classify(samples, deploymentIds)
111111
}
112112

113-
/** Treat the whole fleet as degraded/unhealthy if any single deployment is. */
113+
/** Admit if ANY deployment is healthy. One deployment per model (and per
114+
* region in the future) means a user's request routes to a specific
115+
* deployment based on their chosen model — a degraded or unhealthy
116+
* deployment for one model doesn't affect users whose model routes
117+
* elsewhere, and `createFireworksRequestWithFallback` falls back to the
118+
* standard Fireworks API on 5xx regardless. Only hold the queue when ALL
119+
* deployments are non-healthy. Degraded beats unhealthy so
120+
* observability/logs show we still have upstream reachable. */
114121
export function classify(
115122
samples: PromSample[],
116123
deploymentIds: string[],
117124
): FireworksHealth {
118-
let worst: FireworksHealth = 'healthy'
125+
if (deploymentIds.length === 0) return 'healthy'
126+
127+
let anyDegraded = false
119128
for (const deploymentId of deploymentIds) {
120129
const h = classifyOne(samples, deploymentId)
121-
if (h === 'unhealthy') return 'unhealthy'
122-
if (h === 'degraded') worst = 'degraded'
130+
if (h === 'healthy') return 'healthy'
131+
if (h === 'degraded') anyDegraded = true
123132
}
124-
return worst
133+
return anyDegraded ? 'degraded' : 'unhealthy'
125134
}
126135

127136
function classifyOne(samples: PromSample[], deploymentId: string): FireworksHealth {

0 commit comments

Comments
 (0)