From c1b90f174e9604d79f4676c29cc72727ceb590dd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:18:50 -0700 Subject: [PATCH 1/4] feat: Gemma-4 QuantizedKVCache fix + Test 9 regression (mlx-swift-lm b440) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Bumps mlx-swift-lm submodule to b440 (tag) / 63707c0: fix(Gemma4Text): dispatch QuantizedKVCache correctly in LLM attention (merges PR #29, closes SharpAI/SwiftLM#71) - Server.swift: expose `kv_bits` as a per-request API field (ChatCompletionRequest.kvBits -> GenerateParameters.kvBits) enabling native MLX QuantizedKVCache without a server restart. - run_benchmark.sh: add Test 9 — QuantizedKVCache regression suite [1/4] kv_bits=4 short [2/4] kv_bits=8 short [3/4] kv_bits=4 long (KV-sharing path) [4/4] baseline Test 9 passed on mlx-community/gemma-4-26b-a4b-it-4bit. --- Sources/SwiftLM/Server.swift | 5 + mlx-swift-lm | 2 +- run_benchmark.sh | 184 +++++++++++++++++++++++++++++++++-- 3 files changed, 182 insertions(+), 9 deletions(-) diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift index 17d68d37..094038b9 100644 --- a/Sources/SwiftLM/Server.swift +++ b/Sources/SwiftLM/Server.swift @@ -1051,6 +1051,7 @@ func handleChatCompletion( let params = GenerateParameters( maxTokens: tokenLimit, maxKVSize: config.ctxSize, + kvBits: chatReq.kvBits, temperature: temperature, topP: topP, topK: topK, @@ -2305,6 +2306,9 @@ struct ChatCompletionRequest: Decodable { let chatTemplateKwargs: [String: Bool]? /// Top-level thinking override emitted by Aegis-AI gateway let enableThinking: Bool? + /// Number of bits for native MLX quantized KV cache (nil = no quantization, 4 or 8 typical). + /// Enables `QuantizedKVCache` instead of `KVCacheSimple`. Separate from `--turbo-kv`. + let kvBits: Int? 
enum CodingKeys: String, CodingKey { case model, messages, stream, temperature, tools, stop, seed @@ -2319,6 +2323,7 @@ struct ChatCompletionRequest: Decodable { case responseFormat = "response_format" case chatTemplateKwargs = "chat_template_kwargs" case enableThinking = "enable_thinking" + case kvBits = "kv_bits" } } diff --git a/mlx-swift-lm b/mlx-swift-lm index 71a77e07..63707c0c 160000 --- a/mlx-swift-lm +++ b/mlx-swift-lm @@ -1 +1 @@ -Subproject commit 71a77e07b4936599cc40c4a423458c2bc834a0cc +Subproject commit 63707c0ccde78daa63ceb0575af52edc9d941c07 diff --git a/run_benchmark.sh b/run_benchmark.sh index 8ad40921..92b47b61 100755 --- a/run_benchmark.sh +++ b/run_benchmark.sh @@ -102,8 +102,9 @@ echo "5) Test 5: ALM Audio End-to-End Evaluation" echo "6) Test 6: Omni End-to-End Evaluation" echo "7) Model Maintain List and Delete" echo "8) Test 8: Tool-Call Degeneration Regression (Gemma-4 vague-query bug)" -echo "9) Quit" -read -p "Option (0-9): " suite_opt +echo "9) Test 9: Quantized KV Cache Regression (Gemma-4 issue #71 — native kv_bits)" +echo "q) Quit" +read -p "Option (0-9/q): " suite_opt if [ "$suite_opt" == "0" ]; then echo "==============================================" @@ -131,12 +132,13 @@ if [ "$suite_opt" == "0" ]; then exit 0 fi -if [ "$suite_opt" == "9" ] || [ "$suite_opt" == "8" ] || [ -z "$suite_opt" ]; then - # 9 = Quit (old 8), 8 = Test 8 — only exit on 9 or blank - if [ "$suite_opt" == "9" ] || [ -z "$suite_opt" ]; then - echo "Exiting." - exit 0 - fi +if [ "$suite_opt" == "q" ] || [ -z "$suite_opt" ]; then + echo "Exiting." 
+ exit 0 +fi + +if [ "$suite_opt" == "9" ] || [ "$suite_opt" == "8" ]; then + : # handled below — fall through fi if [ "$suite_opt" == "7" ]; then @@ -969,6 +971,172 @@ EOF exit 0 fi +# ── Test 9: QuantizedKVCache Regression (issue #71) ──────────────────────── +# Verifies that Gemma-4 text models can decode with native MLX QuantizedKVCache +# (kv_bits=4 and kv_bits=8) without triggering the: +# fatalError: `update` was called on `QuantizedKVCache`. Use `updateQuantized`. +# crash fixed in PR #29 of mlx-swift-lm. +# +# Pass criteria: +# - 4-bit run: prefill + ≥20 decode tokens, response is non-empty coherent text +# - 8-bit run: same +# - Multi-turn run: second turn with kv_bits=4 also succeeds (exercises sharedKV path) +if [ "$suite_opt" == "9" ]; then + echo "" + echo "=> Test 9: Quantized KV Cache Regression (issue #71) on $FULL_MODEL" + echo " Tests MLX native QuantizedKVCache (kv_bits=4, kv_bits=8) — NOT TurboKV" + echo " This exercises the fix in mlx-swift-lm PR #29." + + echo "Starting server on port 5431..." + killall SwiftLM 2>/dev/null + mkdir -p tmp + # No --turbo-kv flag: we want the vanilla KVCacheSimple path that will be + # upgraded to QuantizedKVCache by the per-request kv_bits field. + $BIN --model "$FULL_MODEL" --port 5431 --stream-experts --ctx-size 8192 > ./tmp/kvcache_regression.log 2>&1 & + SERVER_PID=$! + + echo "Waiting for server (up to 180s)..." + for i in {1..180}; do + if ! kill -0 $SERVER_PID 2>/dev/null; then + echo "❌ Server died early. Logs:" + print_server_log ./tmp/kvcache_regression.log + exit 1 + fi + if curl -sf http://127.0.0.1:5431/health > /dev/null 2>&1; then + echo "Server ready (${i}s)" + break + fi + sleep 1 + done + + echo "" + echo "Running QuantizedKVCache regression suite..." 
+ + python3 - << 'KVBITS_EOF' +import json, urllib.request, time, sys, re + +BASE = "http://127.0.0.1:5431" + +FAILS = [] + +def call(messages, kv_bits=None, max_tokens=60, temperature=0.0): + payload = { + "messages": messages, + "max_tokens": max_tokens, + "temperature": temperature, + "stream": False, + } + if kv_bits is not None: + payload["kv_bits"] = kv_bits + req = urllib.request.Request( + f"{BASE}/v1/chat/completions", + data=json.dumps(payload).encode(), + headers={"Content-Type": "application/json"}, + ) + t0 = time.time() + try: + with urllib.request.urlopen(req, timeout=180) as r: + d = json.loads(r.read()) + except Exception as e: + return None, str(e), time.time() - t0 + elapsed = time.time() - t0 + content = d["choices"][0]["message"].get("content") or "" + # Strip Gemma-4 thinking blocks + content = re.sub(r"<\|channel\|>thought.*?", "", content, flags=re.DOTALL).strip() + return d, content, elapsed + +MSGS_SHORT = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Name the three primary colours. Be brief."}, +] + +# Longer prompt to exercise the KV sharing layers (last 20 of Gemma-4 share KV +# from earlier layers — the bug manifests at those layers on multi-token prefills). +MSGS_LONG = [ + {"role": "system", "content": "You are a knowledgeable AI assistant. Answer concisely."}, + {"role": "user", "content": "Explain in two sentences why the sky appears blue during the day and red at sunset. 
Use physics terminology."}, +] + +# ── [1] 4-bit quantized KV cache ── +print("\n─── [1/4] kv_bits=4, short prompt ───") +d, content, t = call(MSGS_SHORT, kv_bits=4) +if d is None: + print(f" ❌ CRASHED: {content}") + FAILS.append("kv_bits=4 short: server crash or timeout") +else: + gen_toks = d["usage"]["completion_tokens"] + ok = len(content.strip()) > 5 and gen_toks >= 3 + print(f" {'✅' if ok else '❌'} [{t:.1f}s, {gen_toks} tokens]: {content[:100]}") + if not ok: + FAILS.append(f"kv_bits=4 short: too few tokens or empty ({gen_toks} tokens)") + +# ── [2] 8-bit quantized KV cache ── +print("\n─── [2/4] kv_bits=8, short prompt ───") +d, content, t = call(MSGS_SHORT, kv_bits=8) +if d is None: + print(f" ❌ CRASHED: {content}") + FAILS.append("kv_bits=8 short: server crash or timeout") +else: + gen_toks = d["usage"]["completion_tokens"] + ok = len(content.strip()) > 5 and gen_toks >= 3 + print(f" {'✅' if ok else '❌'} [{t:.1f}s, {gen_toks} tokens]: {content[:100]}") + if not ok: + FAILS.append(f"kv_bits=8 short: too few tokens or empty ({gen_toks} tokens)") + +# ── [3] 4-bit, longer prompt (exercises KV-sharing layers) ── +print("\n─── [3/4] kv_bits=4, longer prompt (exercises KV-sharing path) ───") +d, content, t = call(MSGS_LONG, kv_bits=4, max_tokens=120) +if d is None: + print(f" ❌ CRASHED: {content}") + FAILS.append("kv_bits=4 long: server crash or timeout") +else: + gen_toks = d["usage"]["completion_tokens"] + ok = len(content.strip()) > 10 and gen_toks >= 5 + print(f" {'✅' if ok else '❌'} [{t:.1f}s, {gen_toks} tokens]: {content[:120]}") + if not ok: + FAILS.append(f"kv_bits=4 long: too few tokens or empty ({gen_toks} tokens)") + +# ── [4] Baseline without kv_bits (must still work — regression guard) ── +print("\n─── [4/4] kv_bits=None baseline (no quantization) ───") +d, content, t = call(MSGS_SHORT, kv_bits=None) +if d is None: + print(f" ❌ CRASHED: {content}") + FAILS.append("baseline (no kv_bits): server crash or timeout") +else: + gen_toks = 
d["usage"]["completion_tokens"] + ok = len(content.strip()) > 5 and gen_toks >= 3 + print(f" {'✅' if ok else '❌'} [{t:.1f}s, {gen_toks} tokens]: {content[:100]}") + if not ok: + FAILS.append(f"baseline: too few tokens or empty ({gen_toks} tokens)") + +print("\n" + "─" * 60) +if not FAILS: + print("✅ REGRESSION PASSED — QuantizedKVCache dispatches correctly.") + print(" kv_bits=4 ✓ | kv_bits=8 ✓ | KV-sharing path ✓ | baseline ✓") + sys.exit(0) +else: + print("❌ REGRESSION FAILED:") + for f in FAILS: + print(f" • {f}") + print("\n Root cause (if kv_bits runs crash): unconditional `cache.update()` call") + print(" in Gemma4TextAttention.callAsFunction — see mlx-swift-lm PR #29.") + sys.exit(1) +KVBITS_EOF + TEST9_EXIT=$? + + echo "" + echo "Cleaning up..." + kill $SERVER_PID 2>/dev/null + wait $SERVER_PID 2>/dev/null + + if [ $TEST9_EXIT -eq 0 ]; then + echo "✅ Test 9 PASSED" + else + echo "❌ Test 9 FAILED — see output above." + fi + exit $TEST9_EXIT +fi + # Fallback to Test 1 for anything else echo "" read -p "Enter context lengths to test [default: 512,40000,100000]: " CONTEXTS From f007b3baa32fba5ca7e8777edb3ed1603eda9ea8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:33:10 -0700 Subject: [PATCH 2/4] docs+fix: kv_bits README docs + address Copilot review on PR #73 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit README.md: - Added '🔧 Per-Request API Parameters' section with kv_bits table, kv_bits vs --turbo-kv comparison table, and curl usage example - Clarified --turbo-kv CLI entry: 'activates after 2048 tokens, server-wide' Server.swift: - Added kv_bits input validation (only nil/4/8 accepted; returns 400 otherwise) - Bypass prompt cache restore when kv_bits is set (prevents unsafe mixing of QuantizedKVCache and KVCacheSimple states across requests) - Bypass prompt cache save when kv_bits is set (same safety reason) run_benchmark.sh 
(Test 9): - Corrected header comment to match actual assertions (removed false ≥20 token and multi-turn claims; stated actual ≥3 token / non-empty checks) - Added explicit SERVER_READY flag + post-loop failure with log dump - Widened thinking-block regex to handle both <|channel|>thought and <|channel>thought --- README.md | 38 +++++++++++++++++++++++++++++++++++- Sources/SwiftLM/Server.swift | 25 +++++++++++++++++++++--- run_benchmark.sh | 18 ++++++++++++----- 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 9bf4bacd..e12b4999 100644 --- a/README.md +++ b/README.md @@ -352,10 +352,46 @@ curl http://localhost:5413/v1/chat/completions \ | `--min-p` | `0.0` | Default min-p sampling threshold relative to the highest probability token (0 disables) | | `--gpu-layers` | `model_default`| Restrict the amount of layers allocated to GPU hardware | | `--stream-experts` | `false` | Enable SSD expert streaming for MoE models (10x speedup) | -| `--turbo-kv` | `false` | Enable TurboQuant 3-bit KV cache compression | +| `--turbo-kv` | `false` | Enable TurboQuant 3-bit KV cache compression (activates after 2048 tokens, server-wide) | | `--draft-model` | (none) | Draft model path/ID for speculative decoding (in-RAM models only) | | `--num-draft-tokens` | `4` | Number of draft tokens per speculation round | +## 🔧 Per-Request API Parameters + +In addition to the standard OpenAI fields (`temperature`, `top_p`, `max_tokens`, etc.), SwiftLM accepts the following **SwiftLM-specific** fields on `POST /v1/chat/completions`: + +| Field | Type | Description | +|---|---|---| +| `kv_bits` | `int` (4 or 8) | Enable **MLX-native quantized KV cache** for this request. Uses `QuantizedKVCache` (standard group quantization) instead of `KVCacheSimple`. Separate from `--turbo-kv`. Reduces KV memory ~2–4× at mild quality cost. | +| `enable_thinking` | `bool` | Force-enable or disable chain-of-thought thinking blocks for Gemma-4 / Qwen3. 
| +| `kv_group_size` | `int` | Group size for `kv_bits` quantization (default: `64`). | +| `top_k` | `int` | Per-request top-k sampling override (0 = disabled). | +| `min_p` | `float` | Per-request min-p sampling threshold (0 = disabled). | +| `repetition_penalty` | `float` | Token repetition penalty (e.g. `1.15`). | + +### `kv_bits` vs `--turbo-kv` — What's the difference? + +| | `kv_bits` (per-request) | `--turbo-kv` (server flag) | +|---|---|---| +| **Scope** | Per-request, sent in JSON body | Server-wide, set at startup | +| **Algorithm** | MLX-native group quantization (4-bit / 8-bit) | Custom 3-bit PolarQuant + QJL Walsh-Hadamard | +| **Activation** | From token 0 | After 2048 tokens | +| **Memory savings** | ~2–4× vs FP16 | ~3.5× vs FP16 | +| **Use case** | Targeted memory reduction per conversation | Extreme long-context (100K+) compression | + +### Example: Enable 4-bit KV cache per request +```bash +curl http://localhost:5413/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gemma-4-26b-a4b-it-4bit", + "kv_bits": 4, + "messages": [ + {"role": "user", "content": "Summarize the history of computing in 3 sentences."} + ] + }' +``` + ## 📦 Requirements - macOS 14.0+ diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift index 094038b9..c6e416d3 100644 --- a/Sources/SwiftLM/Server.swift +++ b/Sources/SwiftLM/Server.swift @@ -1048,6 +1048,16 @@ func handleChatCompletion( // These are accepted but may not affect generation if MLX doesn't support them } + // ── Validate kv_bits: only nil, 4, and 8 are supported ── + if let kb = chatReq.kvBits, kb != 4 && kb != 8 { + let errBody = "{\"error\":{\"message\":\"Invalid kv_bits value \(kb). 
Supported values are 4 and 8.\",\"type\":\"invalid_request_error\",\"code\":\"invalid_kv_bits\"}}" + return Response( + status: .badRequest, + headers: jsonHeaders(), + body: .init(byteBuffer: ByteBuffer(string: errBody)) + ) + } + let params = GenerateParameters( maxTokens: tokenLimit, maxKVSize: config.ctxSize, @@ -1201,9 +1211,13 @@ func handleChatCompletion( // raw <|image|>/<|audio|> token embeddings instead of the projected features. let isMultimodalRequest = lmInput.image != nil || lmInput.audio != nil - // Try to restore via token-by-token prefix match (llama-server style) + // Try to restore via token-by-token prefix match (llama-server style). + // Skip for quantized-KV requests: the prompt cache stores KV state produced + // with KVCacheSimple; restoring it into a QuantizedKVCache (or vice-versa) + // is unsafe and produces incorrect results or runtime failures. + let skipPromptCache = isMultimodalRequest || params.kvBits != nil var stream: AsyncStream - if !isMultimodalRequest, let cachedCount = await promptCache.restore(newTokens: promptTokens, into: cache) { + if !skipPromptCache, let cachedCount = await promptCache.restore(newTokens: promptTokens, into: cache) { // Cache hit: KV state is pre-populated up to cachedCount tokens. // Only compute the remaining (new) tokens. var startIndex = cachedCount @@ -1252,6 +1266,10 @@ func handleChatCompletion( let onPrefillDone: (() async -> Void)? = { if turboHasCompressed { print("[SwiftLM] 🧠 Skipping prompt cache save — TurboQuant has compressed \(cache.compactMap { ($0 as? KVCacheSimple)?.compressedOffset }.max() ?? 0) tokens. Saving would decode ~37 GB back to fp16.") + } else if params.kvBits != nil { + // kv_bits is set: the cache contains QuantizedKVCache layers whose token + // format is incompatible with the FP16 KVCacheSimple format expected by + // promptCache.save. Skip saving to prevent unsafe mixed-format restores. 
} else { await promptCache.save(tokens: promptTokens, cache: cache) } @@ -2306,7 +2324,8 @@ struct ChatCompletionRequest: Decodable { let chatTemplateKwargs: [String: Bool]? /// Top-level thinking override emitted by Aegis-AI gateway let enableThinking: Bool? - /// Number of bits for native MLX quantized KV cache (nil = no quantization, 4 or 8 typical). + /// Number of bits for native MLX quantized KV cache (nil = no quantization). + /// Only 4 and 8 are supported by the underlying MLX QuantizedKVCache. /// Enables `QuantizedKVCache` instead of `KVCacheSimple`. Separate from `--turbo-kv`. let kvBits: Int? diff --git a/run_benchmark.sh b/run_benchmark.sh index 92b47b61..88b1dc86 100755 --- a/run_benchmark.sh +++ b/run_benchmark.sh @@ -978,9 +978,10 @@ fi # crash fixed in PR #29 of mlx-swift-lm. # # Pass criteria: -# - 4-bit run: prefill + ≥20 decode tokens, response is non-empty coherent text +# - 4-bit run: server does not crash, returns non-empty text response (≥3 tokens) # - 8-bit run: same -# - Multi-turn run: second turn with kv_bits=4 also succeeds (exercises sharedKV path) +# - Longer prompt run: exercises the last-20-layer KV-sharing path, same pass criteria +# - Baseline (no kv_bits): regression guard that the non-quantized path still works if [ "$suite_opt" == "9" ]; then echo "" echo "=> Test 9: Quantized KV Cache Regression (issue #71) on $FULL_MODEL" @@ -995,7 +996,7 @@ if [ "$suite_opt" == "9" ]; then $BIN --model "$FULL_MODEL" --port 5431 --stream-experts --ctx-size 8192 > ./tmp/kvcache_regression.log 2>&1 & SERVER_PID=$! - echo "Waiting for server (up to 180s)..." + SERVER_READY=0 for i in {1..180}; do if ! kill -0 $SERVER_PID 2>/dev/null; then echo "❌ Server died early. Logs:" @@ -1004,10 +1005,17 @@ if [ "$suite_opt" == "9" ]; then fi if curl -sf http://127.0.0.1:5431/health > /dev/null 2>&1; then echo "Server ready (${i}s)" + SERVER_READY=1 break fi sleep 1 done + if [ $SERVER_READY -eq 0 ]; then + echo "❌ Server not ready after 180s. 
Logs:" + print_server_log ./tmp/kvcache_regression.log + kill $SERVER_PID 2>/dev/null + exit 1 + fi echo "" echo "Running QuantizedKVCache regression suite..." @@ -1041,8 +1049,8 @@ def call(messages, kv_bits=None, max_tokens=60, temperature=0.0): return None, str(e), time.time() - t0 elapsed = time.time() - t0 content = d["choices"][0]["message"].get("content") or "" - # Strip Gemma-4 thinking blocks - content = re.sub(r"<\|channel\|>thought.*?", "", content, flags=re.DOTALL).strip() + # Strip Gemma-4 thinking blocks — handle both <|channel|>thought and <|channel>thought variants + content = re.sub(r"<\|channel\|?>thought.*?", "", content, flags=re.DOTALL).strip() return d, content, elapsed MSGS_SHORT = [ From ccccdebfb2b323ff16359629a7a9fa707c4cf491 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:41:08 -0700 Subject: [PATCH 3/4] docs: expand Supported Models section to full architecture list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace 🧠 with 📡 heading emoji - Rewrite as structured tables (Text / Vision / Audio) with all 50+ model families derived from the actual MLXLLM + MLXVLM model file inventory - LLM table: Gemma, Qwen, Phi, Mistral, Llama, GLM, DeepSeek, Falcon, LFM2, OLMo, Granite, SmolLM3, InternLM2, Cohere, Jamba, Exaone, MiMo, Ernie, Baichuan, Bailing, NemotronH, Starcoder2, OpenELM, BitNet, MiniMax, Apertus/AfMoE, MiniCPM, Qwen3Next - VLM table: Gemma4, Gemma3, Qwen3-VL, Qwen2-VL/2.5-VL, LFM2-VL, Pixtral, PaliGemma, Idefics3, Mistral3, FastVLM, SmolVLM2, GlmOcr, QwenVL - ALM table: Gemma-4-e4b only (factually correct — Qwen2-Audio removed; it was never wired into the audio pipeline here) --- README.md | 79 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index e12b4999..789a11fc 100644 --- a/README.md +++ b/README.md @@ 
-89,25 +89,76 @@ Benchmark results for `gemma-4-26b-a4b-it-4bit` (26B MoE, 4-bit) on M5 Pro 64 GB --- -## 🧠 Supported Models & Methodologies +## 📡 Supported Models & Methodologies -`SwiftLM` dynamically maps Apple MLX primitives to standard HuggingFace architectures, enabling complete support for the latest frontier open-weights models across modalities (Text, Vision, Audio). +`SwiftLM` dynamically maps Apple MLX primitives to standard HuggingFace architectures, enabling native Metal inference across the latest frontier open-weights models. -### Text (LLMs) -- **Gemma 4**: Fully supports both Dense (`gemma-4-e4b`) and Sparse Mixture of Experts (MoE) architectures (`gemma-4-26b`, `gemma-4-31b`). -- **Qwen 2.5 & 3**: Robust support for sliding window attention limits and custom RoPE scaling. -- **Mistral & Mixtral**: Out-of-the-box structural mappings. -- **Phi-3 & Phi-3.5**: Full 128k context parsing via Swift chunked-prefill. +### 💬 Text (LLMs) -### Vision (VLMs) +| Family | Models | Notes | +|---|---|---| +| **Gemma 4** | `gemma-4-e2b`, `gemma-4-e4b` (dense) · `gemma-4-26b-a4b`, `gemma-4-31b` (MoE) | Interleaved local + global attention; KV sharing; native quantized KV cache (issue #71 fix) | +| **Gemma 3 / 3n** | `gemma-3-*`, `gemma-3n-*` | Google Gemma 3 and nano variants | +| **Gemma / Gemma 2** | `gemma-*`, `gemma-2-*` | Original Gemma family | +| **Qwen 3.5** | `Qwen3.5-7B`, `Qwen3.5-27B`, `Qwen3.5-122B-A10B`, `Qwen3.5-397B-A22B` | Dense + MoE; SSD streaming at 10× for 122B/397B | +| **Qwen 3** | `Qwen3-*` (dense + MoE) | Sliding window + hybrid attention | +| **Qwen 2.5** | `Qwen2.5-7B`, `Qwen2.5-14B`, `Qwen2.5-72B` | Robust RoPE scaling | +| **Qwen 2** | `Qwen2-*` | Linear RoPE variants | +| **Phi 4 / PhiMoE** | `phi-4-mlx`, `Phi-3.5-MoE` | Microsoft Phi family incl. 
MoE | +| **Phi 3 / Phi** | `Phi-3`, `Phi-3.5-mini` | 128k context via chunked prefill | +| **Mistral / Mixtral** | `Mistral-7B`, `Mistral-4`, `Mixtral-*` | GQA + sliding window variants | +| **Llama / Llama 3** | `Llama-3.1-*`, `Llama-3.2-*`, `Llama-3.3-*` | YaRN + dynamic NTK RoPE scaling | +| **GLM 4 / GLM 5.1** | `GLM-4-*`, `GLM-5.1-RAM-270GB`, `GLM-5.1-4bit` | Dense + MoE-Lite variants | +| **DeepSeek V3** | `DeepSeek-V3-*` | MLA attention architecture | +| **Falcon H1** | `Falcon-H1-*` | Falcon hybrid SSM+attention | +| **LFM 2** | `LFM2-*`, `LFM2-MoE-*` | Liquid AI dense + MoE | +| **OLMo 2 / OLMo 3 / OLMoE** | `OLMo-2-*`, `OLMo-3-*` | AllenAI open language models | +| **Granite / GraniteMoE** | `Granite-*`, `GraniteMoE-Hybrid-*` | IBM Granite hybrid Mamba+attention | +| **SmolLM 3** | `SmolLM3-*` | HuggingFace compact LM | +| **MiniCPM** | `MiniCPM-*` | Lightweight efficient LM | +| **InternLM 2** | `InternLM2-*` | Shanghai AI Lab series | +| **Cohere / Command-R** | `Command-R-*`, `c4ai-*` | Cohere retrieval-tuned models | +| **Jamba** | `Jamba-v0.1` | AI21 hybrid Mamba+attention | +| **Exaone 4** | `EXAONE-4.0-*` | LG AI Research | +| **MiMo / MiMo V2** | `MiMo-7B-*` | Xiaomi reasoning model | +| **Ernie 4.5** | `ERNIE-4.5-*` | Baidu ERNIE series | +| **Baichuan M1** | `Baichuan-M1-*` | Baichuan multimodal base | +| **Bailing MoE** | `Ling-*` | Bailing/Ling MoE family | +| **NemotronH** | `Nemotron-H-*` | NVIDIA Nemotron hybrid | +| **Starcoder 2** | `starcoder2-*` | Code generation | +| **OpenELM** | `OpenELM-*` | Apple on-device efficient LM | +| **Apertus / AfMoE** | `Apertus-*` | Sparse MoE research models | +| **BitNet** | `bitnet-*` | 1-bit weight quantization | +| **MiniMax** | `MiniMax-Text-*` | Lightning attention architecture | +| **Olmo3** | `Olmo3-*` | AllenAI Olmo3 series | + +### 👁️ Vision (VLMs) *Run with `--vision` flag.* -- **Qwen2-VL & Qwen3-VL**: Real-time positional bounding and Metal image scaling. 
-- **PaliGemma / LFM2-VL / Pixtral**: Base64 spatial decomposition. -### Audio (ALMs) -*Run with `--audio` flag.* -- **Qwen2-Audio (7B-Instruct)**: Deep multi-modal spectrogram processing via Swift audio interleaving. -- **Gemma-4 Audio Pipelines**: Ready for Audio-in/Text-out variants mapping `.audio_tower` extraction parameters natively off NVMe. +| Family | Models | Notes | +|---|---|---| +| **Gemma 4** | `gemma-4-*` (VLM mode) | Native image tower via MLXVLM | +| **Gemma 3** | `gemma-3-*` (VLM mode) | PaliGemma-style image projection | +| **Qwen3-VL / Qwen3.5-VL** | `Qwen3-VL-*`, `Qwen3.5-VL-*` | Dynamic resolution with native RoPE | +| **Qwen2-VL / Qwen2.5-VL** | `Qwen2-VL-2B/7B`, `Qwen2.5-VL-*` | Real-time positional bounding + Metal image scaling | +| **LFM2-VL** | `LFM2-VL-1.6B` | Liquid AI multimodal | +| **Pixtral** | `pixtral-12b` | Mistral vision model | +| **PaliGemma** | `paligemma-*` | Google vision-language | +| **Idefics 3** | `Idefics3-*` | HuggingFace multimodal | +| **Mistral 3** | `Mistral-Small-3.1-*` | Mistral vision variant | +| **FastVLM** | `FastVLM-*` | Apple on-device VLM | +| **SmolVLM 2** | `SmolVLM2-*` | HuggingFace compact VLM | +| **GLM OCR** | `glm-4v-*` | THUDM vision+OCR | +| **QwenVL** | `Qwen-VL-*` | Original Qwen VL | + +### 🎧 Audio (ALMs) +*Run with `--audio` flag. 
Only `gemma-4-e4b` variants include an audio tower.* + +| Family | Models | Notes | +|---|---|---| +| **Gemma 4 Omni** | `gemma-4-e4b-it-4bit`, `gemma-4-e4b-it-8bit` | Audio-in via vDSP STFT → Mel spectrogram (16kHz, 128 bins); text-out | + + --- From ed5f8f6db8f851680e7358fece19fff9d183c1b0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:43:42 -0700 Subject: [PATCH 4/4] docs: remove GLM 5.1 from supported models (still on feature branch, reverted from main in 50c3732) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 789a11fc..0e8fb1f8 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ Benchmark results for `gemma-4-26b-a4b-it-4bit` (26B MoE, 4-bit) on M5 Pro 64 GB | **Phi 3 / Phi** | `Phi-3`, `Phi-3.5-mini` | 128k context via chunked prefill | | **Mistral / Mixtral** | `Mistral-7B`, `Mistral-4`, `Mixtral-*` | GQA + sliding window variants | | **Llama / Llama 3** | `Llama-3.1-*`, `Llama-3.2-*`, `Llama-3.3-*` | YaRN + dynamic NTK RoPE scaling | -| **GLM 4 / GLM 5.1** | `GLM-4-*`, `GLM-5.1-RAM-270GB`, `GLM-5.1-4bit` | Dense + MoE-Lite variants | +| **GLM 4** | `GLM-4-*` | THUDM GLM-4 dense + MoE-Lite variants | | **DeepSeek V3** | `DeepSeek-V3-*` | MLA attention architecture | | **Falcon H1** | `Falcon-H1-*` | Falcon hybrid SSM+attention | | **LFM 2** | `LFM2-*`, `LFM2-MoE-*` | Liquid AI dense + MoE |