Skip to content

Commit b01d2e3

Browse files
committed
Admit users by p90 of prefill queue time instead of p50
1 parent 1c294a0 commit b01d2e3

File tree

2 files changed

+25
-21
lines changed

2 files changed

+25
-21
lines changed

web/src/server/free-session/__tests__/fireworks-health.test.ts

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { describe, expect, test } from 'bun:test'
33
import {
44
KV_BLOCKS_DEGRADED_FRACTION,
55
KV_BLOCKS_UNHEALTHY_FRACTION,
6-
PREFILL_QUEUE_DEGRADED_MS,
6+
PREFILL_QUEUE_P90_DEGRADED_MS,
77
classify,
88
} from '../fireworks-health'
99

@@ -19,20 +19,22 @@ function kvBlocks(value: number): PromSample {
1919
}
2020
}
2121

22-
/** Emit a minimal cumulative-counts histogram for prefill queue where every
23-
* event lands in exactly one bucket `le`. */
24-
function prefillQueueBuckets(p50Ms: number): PromSample[] {
22+
/** Emit a cumulative-counts histogram for prefill queue where the p90
23+
* percentile falls in the first bucket whose le ≥ p90Ms (so p90 ≤ that le).
24+
* Uses 10 total events all landing in that bucket, so the 90th-percentile
25+
* interpolates within that bucket, above its lower boundary. */
26+
function prefillQueueBuckets(p90Ms: number): PromSample[] {
2527
const les = [50, 150, 300, 500, 750, 1000, 1500, 3000, 5000, 7500, 10000]
2628
const name = 'latency_prefill_queue_ms_bucket:sum_by_deployment'
27-
// cumulative count = 0 below p50, 1 at and above p50
29+
const total = 10
2830
return les.map((le) => ({
2931
name,
3032
labels: { deployment_id: DEPLOY, le: String(le) },
31-
value: le >= p50Ms ? 1 : 0,
33+
value: le >= p90Ms ? total : 0,
3234
})).concat({
3335
name,
3436
labels: { deployment_id: DEPLOY, le: '+Inf' },
35-
value: 1,
37+
value: total,
3638
})
3739
}
3840

@@ -58,10 +60,10 @@ describe('fireworks health classifier', () => {
5860
expect(classify(samples, [DEPLOY])).toBe('healthy')
5961
})
6062

61-
test('degraded when prefill queue p50 exceeds the threshold', () => {
63+
test('degraded when prefill queue p90 exceeds the threshold', () => {
6264
const samples: PromSample[] = [
6365
kvBlocks(0.5),
64-
...prefillQueueBuckets(PREFILL_QUEUE_DEGRADED_MS + 500),
66+
...prefillQueueBuckets(PREFILL_QUEUE_P90_DEGRADED_MS + 500),
6567
]
6668
expect(classify(samples, [DEPLOY])).toBe('degraded')
6769
})
@@ -110,7 +112,7 @@ describe('fireworks health classifier', () => {
110112
const other = 'other123'
111113
const samples: PromSample[] = [
112114
kvBlocks(0.5),
113-
...prefillQueueBuckets(PREFILL_QUEUE_DEGRADED_MS + 500),
115+
...prefillQueueBuckets(PREFILL_QUEUE_P90_DEGRADED_MS + 500),
114116
{
115117
name: 'generator_kv_blocks_fraction:avg_by_deployment',
116118
labels: { deployment_id: other },

web/src/server/free-session/fireworks-health.ts

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
import { FIREWORKS_ACCOUNT_ID, FIREWORKS_DEPLOYMENT_MAP } from '@/llm-api/fireworks-config'
21
import { env } from '@codebuff/internal/env'
2+
3+
import { FIREWORKS_ACCOUNT_ID, FIREWORKS_DEPLOYMENT_MAP } from '@/llm-api/fireworks-config'
34
import { logger } from '@/util/logger'
45

56
/**
@@ -15,13 +16,14 @@ import { logger } from '@/util/logger'
1516
*/
1617
export type FireworksHealth = 'healthy' | 'degraded' | 'unhealthy'
1718

18-
/** Degrade once median prefill-queue latency crosses this bound. Strict by
19-
* design — a 1s queue on top of ~1s prefill already means users feel 2s+
20-
* before first token. */
21-
export const PREFILL_QUEUE_DEGRADED_MS = 125
19+
/** Degrade once p90 prefill-queue latency crosses this bound. Using p90
20+
* instead of p50 gives a better early-warning signal — the tail starts
21+
* rising before the median does, so we can halt admission before most
22+
* users feel it. */
23+
export const PREFILL_QUEUE_P90_DEGRADED_MS = 1000
2224

2325
/** Leading indicator of load — responds instantly to memory pressure, while
24-
* prefill-queue p50 is a lagging window statistic. Degrading here lets us
26+
* prefill-queue p90 is a lagging window statistic. Degrading here lets us
2527
* halt admission *before* users feel it. */
2628
export const KV_BLOCKS_DEGRADED_FRACTION = 0.8
2729

@@ -160,16 +162,16 @@ function classifyOne(samples: PromSample[], deploymentId: string): FireworksHeal
160162
return 'unhealthy'
161163
}
162164

163-
const p50 = histogramPercentile(
165+
const p90 = histogramPercentile(
164166
samples,
165167
'latency_prefill_queue_ms_bucket:sum_by_deployment',
166168
deploymentId,
167-
50,
169+
90,
168170
)
169-
if (p50 !== undefined && p50 > PREFILL_QUEUE_DEGRADED_MS) {
171+
if (p90 !== undefined && p90 > PREFILL_QUEUE_P90_DEGRADED_MS) {
170172
logger.info(
171-
{ deploymentId, prefillQueueP50Ms: Math.round(p50), kvBlocks },
172-
'[FireworksHealth] degraded: prefill queue p50 over threshold',
173+
{ deploymentId, prefillQueueP90Ms: Math.round(p90), kvBlocks },
174+
'[FireworksHealth] degraded: prefill queue p90 over threshold',
173175
)
174176
return 'degraded'
175177
}

0 commit comments

Comments
 (0)