Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ debug/

# Infisical config (user-specific)
.infisical.json
.gstack/
38 changes: 27 additions & 11 deletions web/src/llm-api/__tests__/fireworks-deployment.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,35 @@ describe('Fireworks deployment routing', () => {
})

it('isDeploymentCoolingDown returns false initially', () => {
  // No deployment has been marked as scaling up yet.
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
})

it('isDeploymentCoolingDown returns true after markDeploymentScalingUp', () => {
  // Marking a deployment as scaling up should start its cooldown window.
  markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
})

it('isDeploymentCoolingDown returns false after resetDeploymentCooldown', () => {
  markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
  // Calling reset with no id clears every deployment's cooldown.
  resetDeploymentCooldown()
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
})

it('cooldown on one deployment does not affect other deployments', () => {
  // Mark only one deployment; a different deployment id must stay routable.
  const unrelatedDeploymentId = 'accounts/james-65d217/deployments/other123'
  markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)
  expect(isDeploymentCoolingDown(unrelatedDeploymentId)).toBe(false)
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
})

it('resetDeploymentCooldown with an id only clears that deployment', () => {
  // Put two deployments into cooldown, then clear just the first one.
  const unrelatedDeploymentId = 'accounts/james-65d217/deployments/other123'
  for (const id of [DEPLOYMENT_MODEL_ID, unrelatedDeploymentId]) {
    markDeploymentScalingUp(id)
  }
  resetDeploymentCooldown(DEPLOYMENT_MODEL_ID)
  expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
  expect(isDeploymentCoolingDown(unrelatedDeploymentId)).toBe(true)
})

it('DEPLOYMENT_COOLDOWN_MS is 2 minutes', () => {
Expand Down Expand Up @@ -195,8 +211,8 @@ describe('Fireworks deployment routing', () => {
expect(fetchCalls).toHaveLength(2)
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
// Verify cooldown was activated
expect(isDeploymentCoolingDown()).toBe(true)
// Verify cooldown was activated for this specific deployment
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(true)
} finally {
spy.restore()
}
Expand Down Expand Up @@ -243,7 +259,7 @@ describe('Fireworks deployment routing', () => {
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
// Non-scaling 503 should NOT activate the cooldown
expect(isDeploymentCoolingDown()).toBe(false)
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
} finally {
spy.restore()
}
Expand Down Expand Up @@ -283,15 +299,15 @@ describe('Fireworks deployment routing', () => {
expect(fetchCalls).toHaveLength(2)
expect(fetchCalls[0]).toBe(DEPLOYMENT_MODEL_ID)
expect(fetchCalls[1]).toBe(STANDARD_MODEL_ID)
expect(isDeploymentCoolingDown()).toBe(false)
expect(isDeploymentCoolingDown(DEPLOYMENT_MODEL_ID)).toBe(false)
} finally {
spy.restore()
}
})

it('skips deployment during cooldown and goes straight to standard API', async () => {
const spy = spyDeploymentHours(true)
markDeploymentScalingUp()
markDeploymentScalingUp(DEPLOYMENT_MODEL_ID)

const fetchCalls: string[] = []
const mockFetch = mock(async (_url: string | URL | Request, init?: RequestInit) => {
Expand Down
36 changes: 23 additions & 13 deletions web/src/llm-api/fireworks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,32 @@ export function isDeploymentHours(_now: Date = new Date()): boolean {
}

/**
* In-memory cooldown to avoid repeatedly hitting a deployment that is scaling up.
* After a DEPLOYMENT_SCALING_UP 503, we skip the deployment for this many ms.
* In-memory per-deployment cooldown to avoid repeatedly hitting a deployment
* that is scaling up. After a DEPLOYMENT_SCALING_UP 503, we skip that
* deployment for this many ms. Keyed by full deployment path so one
* deployment's cooldown doesn't affect routing to other deployments.
*/
export const DEPLOYMENT_COOLDOWN_MS = 2 * 60 * 1000
let deploymentScalingUpUntil = 0

export function isDeploymentCoolingDown(): boolean {
return Date.now() < deploymentScalingUpUntil
const deploymentCooldowns = new Map<string, number>()

export function isDeploymentCoolingDown(deploymentId: string): boolean {
const until = deploymentCooldowns.get(deploymentId)
if (until === undefined) return false
if (Date.now() < until) return true
deploymentCooldowns.delete(deploymentId)
return false
}

export function markDeploymentScalingUp(): void {
deploymentScalingUpUntil = Date.now() + DEPLOYMENT_COOLDOWN_MS
export function markDeploymentScalingUp(deploymentId: string): void {
deploymentCooldowns.set(deploymentId, Date.now() + DEPLOYMENT_COOLDOWN_MS)
}

export function resetDeploymentCooldown(): void {
deploymentScalingUpUntil = 0
export function resetDeploymentCooldown(deploymentId?: string): void {
if (deploymentId === undefined) {
deploymentCooldowns.clear()
return
}
deploymentCooldowns.delete(deploymentId)
}

export function isFireworksModel(model: string): boolean {
Expand Down Expand Up @@ -725,9 +735,9 @@ export async function createFireworksRequestWithFallback(params: {
const deploymentModelId = FIREWORKS_DEPLOYMENT_MAP[originalModel]
const shouldTryDeployment =
useCustomDeployment &&
deploymentModelId &&
!!deploymentModelId &&
isDeploymentHours() &&
!isDeploymentCoolingDown()
!isDeploymentCoolingDown(deploymentModelId)

if (shouldTryDeployment) {
logger.info(
Expand All @@ -749,7 +759,7 @@ export async function createFireworksRequestWithFallback(params: {
'Fireworks custom deployment returned 5xx, falling back to standard API',
)
if (errorText.includes('DEPLOYMENT_SCALING_UP')) {
markDeploymentScalingUp()
markDeploymentScalingUp(deploymentModelId)
}
// Fall through to standard API request below
} else {
Expand Down
34 changes: 33 additions & 1 deletion web/src/server/free-session/__tests__/fireworks-health.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,43 @@ describe('fireworks health classifier', () => {
expect(classify([], [DEPLOY])).toBe('healthy')
})

test('worst-of across multiple deployments — unhealthy wins over degraded', () => {
test('healthy if any deployment is healthy (one deployment per model, users route per-model)', () => {
const other = 'other123'
const samples: PromSample[] = [
// DEPLOY is healthy
kvBlocks(0.5),
...prefillQueueBuckets(150),
// other is unhealthy
{
name: 'generator_kv_blocks_fraction:avg_by_deployment',
labels: { deployment_id: other },
value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005,
},
]
expect(classify(samples, [DEPLOY, other])).toBe('healthy')
})

test('degraded when all deployments are non-healthy and at least one is degraded', () => {
  const otherId = 'other123'
  // otherId trips the KV-blocks unhealthy backstop.
  const unhealthyKvSample: PromSample = {
    name: 'generator_kv_blocks_fraction:avg_by_deployment',
    labels: { deployment_id: otherId },
    value: KV_BLOCKS_UNHEALTHY_FRACTION + 0.005,
  }
  const samples: PromSample[] = [
    // DEPLOY is degraded (prefill queue p90 over the degraded threshold).
    kvBlocks(0.5),
    ...prefillQueueBuckets(PREFILL_QUEUE_P90_DEGRADED_MS + 500),
    unhealthyKvSample,
  ]
  expect(classify(samples, [DEPLOY, otherId])).toBe('degraded')
})

test('unhealthy only when every deployment is unhealthy', () => {
const other = 'other123'
const samples: PromSample[] = [
kvBlocks(KV_BLOCKS_UNHEALTHY_FRACTION + 0.005),
...prefillQueueBuckets(300),
{
name: 'generator_kv_blocks_fraction:avg_by_deployment',
labels: { deployment_id: other },
Expand Down
19 changes: 14 additions & 5 deletions web/src/server/free-session/fireworks-health.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,18 +110,27 @@ async function probe(): Promise<FireworksHealth> {
return classify(samples, deploymentIds)
}

/** Admit if ANY deployment is healthy. One deployment per model (and per
 * region in the future) means a user's request routes to a specific
 * deployment based on their chosen model — a degraded or unhealthy
 * deployment for one model doesn't affect users whose model routes
 * elsewhere, and `createFireworksRequestWithFallback` falls back to the
 * standard Fireworks API on 5xx regardless. Only hold the queue when ALL
 * deployments are non-healthy. Degraded beats unhealthy so
 * observability/logs show we still have upstream reachable. */
export function classify(
  samples: PromSample[],
  deploymentIds: string[],
): FireworksHealth {
  // No deployments configured: nothing to gate on.
  if (deploymentIds.length === 0) return 'healthy'

  let anyDegraded = false
  for (const deploymentId of deploymentIds) {
    const h = classifyOne(samples, deploymentId)
    // A single healthy deployment is enough to admit traffic.
    if (h === 'healthy') return 'healthy'
    if (h === 'degraded') anyDegraded = true
  }
  // All deployments are non-healthy; prefer 'degraded' if any was reachable.
  return anyDegraded ? 'degraded' : 'unhealthy'
}

function classifyOne(samples: PromSample[], deploymentId: string): FireworksHealth {
Expand Down
Loading