-
Notifications
You must be signed in to change notification settings - Fork 27
fix: Gemma-4 QuantizedKVCache + kv_bits API + Test 9 (mlx-swift-lm b440) #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c1b90f1
f007b3b
ccccdeb
ed5f8f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1048,9 +1048,20 @@ func handleChatCompletion( | |
| // These are accepted but may not affect generation if MLX doesn't support them | ||
| } | ||
|
|
||
| // ── Validate kv_bits: only nil, 4, and 8 are supported ── | ||
| if let kb = chatReq.kvBits, kb != 4 && kb != 8 { | ||
| let errBody = "{\"error\":{\"message\":\"Invalid kv_bits value \(kb). Supported values are 4 and 8.\",\"type\":\"invalid_request_error\",\"code\":\"invalid_kv_bits\"}}" | ||
| return Response( | ||
| status: .badRequest, | ||
| headers: jsonHeaders(), | ||
| body: .init(byteBuffer: ByteBuffer(string: errBody)) | ||
| ) | ||
| } | ||
|
|
||
| let params = GenerateParameters( | ||
| maxTokens: tokenLimit, | ||
| maxKVSize: config.ctxSize, | ||
| kvBits: chatReq.kvBits, | ||
|
||
| temperature: temperature, | ||
| topP: topP, | ||
| topK: topK, | ||
|
|
@@ -1200,9 +1211,13 @@ func handleChatCompletion( | |
| // raw <|image|>/<|audio|> token embeddings instead of the projected features. | ||
| let isMultimodalRequest = lmInput.image != nil || lmInput.audio != nil | ||
|
|
||
| // Try to restore via token-by-token prefix match (llama-server style) | ||
| // Try to restore via token-by-token prefix match (llama-server style). | ||
| // Skip for quantized-KV requests: the prompt cache stores KV state produced | ||
| // with KVCacheSimple; restoring it into a QuantizedKVCache (or vice-versa) | ||
| // is unsafe and produces incorrect results or runtime failures. | ||
| let skipPromptCache = isMultimodalRequest || params.kvBits != nil | ||
| var stream: AsyncStream<Generation> | ||
| if !isMultimodalRequest, let cachedCount = await promptCache.restore(newTokens: promptTokens, into: cache) { | ||
| if !skipPromptCache, let cachedCount = await promptCache.restore(newTokens: promptTokens, into: cache) { | ||
| // Cache hit: KV state is pre-populated up to cachedCount tokens. | ||
| // Only compute the remaining (new) tokens. | ||
| var startIndex = cachedCount | ||
|
|
@@ -1251,6 +1266,10 @@ func handleChatCompletion( | |
| let onPrefillDone: (() async -> Void)? = { | ||
| if turboHasCompressed { | ||
| print("[SwiftLM] 🧠 Skipping prompt cache save — TurboQuant has compressed \(cache.compactMap { ($0 as? KVCacheSimple)?.compressedOffset }.max() ?? 0) tokens. Saving would decode ~37 GB back to fp16.") | ||
| } else if params.kvBits != nil { | ||
| // kv_bits is set: the cache contains QuantizedKVCache layers whose token | ||
| // format is incompatible with the FP16 KVCacheSimple format expected by | ||
| // promptCache.save. Skip saving to prevent unsafe mixed-format restores. | ||
| } else { | ||
| await promptCache.save(tokens: promptTokens, cache: cache) | ||
| } | ||
|
|
@@ -2305,6 +2324,10 @@ struct ChatCompletionRequest: Decodable { | |
| let chatTemplateKwargs: [String: Bool]? | ||
| /// Top-level thinking override emitted by Aegis-AI gateway | ||
| let enableThinking: Bool? | ||
| /// Number of bits for native MLX quantized KV cache (nil = no quantization). | ||
| /// Only 4 and 8 are supported by the underlying MLX QuantizedKVCache. | ||
| /// Enables `QuantizedKVCache` instead of `KVCacheSimple`. Separate from `--turbo-kv`. | ||
| let kvBits: Int? | ||
|
|
||
| enum CodingKeys: String, CodingKey { | ||
| case model, messages, stream, temperature, tools, stop, seed | ||
|
|
@@ -2319,6 +2342,7 @@ struct ChatCompletionRequest: Decodable { | |
| case responseFormat = "response_format" | ||
| case chatTemplateKwargs = "chat_template_kwargs" | ||
| case enableThinking = "enable_thinking" | ||
| case kvBits = "kv_bits" | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`kvBits` is threaded into `GenerateParameters`, but the prompt cache currently keys only on token prefix and will happily restore a cached KV state produced with different generation parameters. With a per-request `kv_bits`, this can mix quantized and non-quantized cache states across requests, which is unsafe and can lead to incorrect results or runtime failures. Consider including `kvBits` (and any other cache-shaping params) in the prompt-cache key, or disabling prompt-cache restore/save whenever `chatReq.kvBits` is non-nil.