From c1b90f174e9604d79f4676c29cc72727ceb590dd Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 22 Apr 2026 11:18:50 -0700
Subject: [PATCH 1/4] feat: Gemma-4 QuantizedKVCache fix + Test 9 regression
 (mlx-swift-lm b440)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Bumps mlx-swift-lm submodule to b440 (tag) / 63707c0:
  fix(Gemma4Text): dispatch QuantizedKVCache correctly in LLM attention
  (merges PR #29, closes SharpAI/SwiftLM#71)

- Server.swift: expose `kv_bits` as a per-request API field
  (ChatCompletionRequest.kvBits -> GenerateParameters.kvBits)
  enabling native MLX QuantizedKVCache without a server restart.

- run_benchmark.sh: add Test 9 — QuantizedKVCache regression suite
  [1/4] kv_bits=4 short  [2/4] kv_bits=8 short
  [3/4] kv_bits=4 long (KV-sharing path)  [4/4] baseline

  Test 9 passed on mlx-community/gemma-4-26b-a4b-it-4bit.
---
 Sources/SwiftLM/Server.swift |   5 +
 mlx-swift-lm                 |   2 +-
 run_benchmark.sh             | 184 +++++++++++++++++++++++++++++++++--
 3 files changed, 182 insertions(+), 9 deletions(-)

diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift
index 17d68d37..094038b9 100644
--- a/Sources/SwiftLM/Server.swift
+++ b/Sources/SwiftLM/Server.swift
@@ -1051,6 +1051,7 @@ func handleChatCompletion(
     let params = GenerateParameters(
         maxTokens: tokenLimit,
         maxKVSize: config.ctxSize,
+        kvBits: chatReq.kvBits,
         temperature: temperature,
         topP: topP,
         topK: topK,
@@ -2305,6 +2306,9 @@ struct ChatCompletionRequest: Decodable {
     let chatTemplateKwargs: [String: Bool]?
     /// Top-level thinking override emitted by Aegis-AI gateway
     let enableThinking: Bool?
+    /// Number of bits for native MLX quantized KV cache (nil = no quantization, 4 or 8 typical).
+    /// Enables `QuantizedKVCache` instead of `KVCacheSimple`.  Separate from `--turbo-kv`.
+    let kvBits: Int?
 
     enum CodingKeys: String, CodingKey {
         case model, messages, stream, temperature, tools, stop, seed
@@ -2319,6 +2323,7 @@ struct ChatCompletionRequest: Decodable {
         case responseFormat = "response_format"
         case chatTemplateKwargs = "chat_template_kwargs"
         case enableThinking = "enable_thinking"
+        case kvBits = "kv_bits"
     }
 }
 
diff --git a/mlx-swift-lm b/mlx-swift-lm
index 71a77e07..63707c0c 160000
--- a/mlx-swift-lm
+++ b/mlx-swift-lm
@@ -1 +1 @@
-Subproject commit 71a77e07b4936599cc40c4a423458c2bc834a0cc
+Subproject commit 63707c0ccde78daa63ceb0575af52edc9d941c07
diff --git a/run_benchmark.sh b/run_benchmark.sh
index 8ad40921..92b47b61 100755
--- a/run_benchmark.sh
+++ b/run_benchmark.sh
@@ -102,8 +102,9 @@ echo "5) Test 5: ALM Audio End-to-End Evaluation"
 echo "6) Test 6: Omni End-to-End Evaluation"
 echo "7) Model Maintain List and Delete"
 echo "8) Test 8: Tool-Call Degeneration Regression (Gemma-4 vague-query bug)"
-echo "9) Quit"
-read -p "Option (0-9): " suite_opt
+echo "9) Test 9: Quantized KV Cache Regression (Gemma-4 issue #71 — native kv_bits)"
+echo "q) Quit"
+read -p "Option (0-9/q): " suite_opt
 
 if [ "$suite_opt" == "0" ]; then
     echo "=============================================="
@@ -131,12 +132,13 @@ if [ "$suite_opt" == "0" ]; then
     exit 0
 fi
 
-if [ "$suite_opt" == "9" ] || [ "$suite_opt" == "8" ] || [ -z "$suite_opt" ]; then
-    # 9 = Quit (old 8), 8 = Test 8 — only exit on 9 or blank
-    if [ "$suite_opt" == "9" ] || [ -z "$suite_opt" ]; then
-        echo "Exiting."
-        exit 0
-    fi
+if [ "$suite_opt" == "q" ] || [ -z "$suite_opt" ]; then
+    echo "Exiting."
+    exit 0
+fi
+
+if [ "$suite_opt" == "9" ] || [ "$suite_opt" == "8" ]; then
+    : # handled below — fall through
 fi
 
 if [ "$suite_opt" == "7" ]; then
@@ -969,6 +971,172 @@ EOF
     exit 0
 fi
 
+# ── Test 9: QuantizedKVCache Regression (issue #71) ────────────────────────
+# Verifies that Gemma-4 text models can decode with native MLX QuantizedKVCache
+# (kv_bits=4 and kv_bits=8) without triggering the:
+#   fatalError: `update` was called on `QuantizedKVCache`. Use `updateQuantized`.
+# crash fixed in PR #29 of mlx-swift-lm.
+#
+# Pass criteria:
+#   - 4-bit run: prefill + ≥20 decode tokens, response is non-empty coherent text
+#   - 8-bit run: same
+#   - Multi-turn run: second turn with kv_bits=4 also succeeds (exercises sharedKV path)
+if [ "$suite_opt" == "9" ]; then
+    echo ""
+    echo "=> Test 9: Quantized KV Cache Regression (issue #71) on $FULL_MODEL"
+    echo "   Tests MLX native QuantizedKVCache (kv_bits=4, kv_bits=8) — NOT TurboKV"
+    echo "   This exercises the fix in mlx-swift-lm PR #29."
+
+    echo "Starting server on port 5431..."
+    killall SwiftLM 2>/dev/null
+    mkdir -p tmp
+    # No --turbo-kv flag: we want the vanilla KVCacheSimple path that will be
+    # upgraded to QuantizedKVCache by the per-request kv_bits field.
+    $BIN --model "$FULL_MODEL" --port 5431 --stream-experts --ctx-size 8192 > ./tmp/kvcache_regression.log 2>&1 &
+    SERVER_PID=$!
+
+    echo "Waiting for server (up to 180s)..."
+    for i in {1..180}; do
+        if ! kill -0 $SERVER_PID 2>/dev/null; then
+            echo "❌ Server died early. Logs:"
+            print_server_log ./tmp/kvcache_regression.log
+            exit 1
+        fi
+        if curl -sf http://127.0.0.1:5431/health > /dev/null 2>&1; then
+            echo "Server ready (${i}s)"
+            break
+        fi
+        sleep 1
+    done
+
+    echo ""
+    echo "Running QuantizedKVCache regression suite..."
+
+    python3 - << 'KVBITS_EOF'
+import json, urllib.request, time, sys, re
+
+BASE = "http://127.0.0.1:5431"
+
+FAILS = []
+
+def call(messages, kv_bits=None, max_tokens=60, temperature=0.0):
+    payload = {
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "stream": False,
+    }
+    if kv_bits is not None:
+        payload["kv_bits"] = kv_bits
+    req = urllib.request.Request(
+        f"{BASE}/v1/chat/completions",
+        data=json.dumps(payload).encode(),
+        headers={"Content-Type": "application/json"},
+    )
+    t0 = time.time()
+    try:
+        with urllib.request.urlopen(req, timeout=180) as r:
+            d = json.loads(r.read())
+    except Exception as e:
+        return None, str(e), time.time() - t0
+    elapsed = time.time() - t0
+    content = d["choices"][0]["message"].get("content") or ""
+    # Strip Gemma-4 thinking blocks
+    content = re.sub(r"<\|channel\|>thought.*?<channel\|>", "", content, flags=re.DOTALL).strip()
+    return d, content, elapsed
+
+MSGS_SHORT = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user",   "content": "Name the three primary colours. Be brief."},
+]
+
+# Longer prompt to exercise the KV sharing layers (last 20 of Gemma-4 share KV
+# from earlier layers — the bug manifests at those layers on multi-token prefills).
+MSGS_LONG = [
+    {"role": "system", "content": "You are a knowledgeable AI assistant. Answer concisely."},
+    {"role": "user",   "content": "Explain in two sentences why the sky appears blue during the day and red at sunset. Use physics terminology."},
+]
+
+# ── [1] 4-bit quantized KV cache ──
+print("\n─── [1/4] kv_bits=4, short prompt ───")
+d, content, t = call(MSGS_SHORT, kv_bits=4)
+if d is None:
+    print(f"  ❌ CRASHED: {content}")
+    FAILS.append("kv_bits=4 short: server crash or timeout")
+else:
+    gen_toks = d["usage"]["completion_tokens"]
+    ok = len(content.strip()) > 5 and gen_toks >= 3
+    print(f"  {'✅' if ok else '❌'} [{t:.1f}s, {gen_toks} tokens]: {content[:100]}")
+    if not ok:
+        FAILS.append(f"kv_bits=4 short: too few tokens or empty ({gen_toks} tokens)")
+
+# ── [2] 8-bit quantized KV cache ──
+print("\n─── [2/4] kv_bits=8, short prompt ───")
+d, content, t = call(MSGS_SHORT, kv_bits=8)
+if d is None:
+    print(f"  ❌ CRASHED: {content}")
+    FAILS.append("kv_bits=8 short: server crash or timeout")
+else:
+    gen_toks = d["usage"]["completion_tokens"]
+    ok = len(content.strip()) > 5 and gen_toks >= 3
+    print(f"  {'✅' if ok else '❌'} [{t:.1f}s, {gen_toks} tokens]: {content[:100]}")
+    if not ok:
+        FAILS.append(f"kv_bits=8 short: too few tokens or empty ({gen_toks} tokens)")
+
+# ── [3] 4-bit, longer prompt (exercises KV-sharing layers) ──
+print("\n─── [3/4] kv_bits=4, longer prompt (exercises KV-sharing path) ───")
+d, content, t = call(MSGS_LONG, kv_bits=4, max_tokens=120)
+if d is None:
+    print(f"  ❌ CRASHED: {content}")
+    FAILS.append("kv_bits=4 long: server crash or timeout")
+else:
+    gen_toks = d["usage"]["completion_tokens"]
+    ok = len(content.strip()) > 10 and gen_toks >= 5
+    print(f"  {'✅' if ok else '❌'} [{t:.1f}s, {gen_toks} tokens]: {content[:120]}")
+    if not ok:
+        FAILS.append(f"kv_bits=4 long: too few tokens or empty ({gen_toks} tokens)")
+
+# ── [4] Baseline without kv_bits (must still work — regression guard) ──
+print("\n─── [4/4] kv_bits=None baseline (no quantization) ───")
+d, content, t = call(MSGS_SHORT, kv_bits=None)
+if d is None:
+    print(f"  ❌ CRASHED: {content}")
+    FAILS.append("baseline (no kv_bits): server crash or timeout")
+else:
+    gen_toks = d["usage"]["completion_tokens"]
+    ok = len(content.strip()) > 5 and gen_toks >= 3
+    print(f"  {'✅' if ok else '❌'} [{t:.1f}s, {gen_toks} tokens]: {content[:100]}")
+    if not ok:
+        FAILS.append(f"baseline: too few tokens or empty ({gen_toks} tokens)")
+
+print("\n" + "─" * 60)
+if not FAILS:
+    print("✅  REGRESSION PASSED — QuantizedKVCache dispatches correctly.")
+    print("   kv_bits=4 ✓  |  kv_bits=8 ✓  |  KV-sharing path ✓  |  baseline ✓")
+    sys.exit(0)
+else:
+    print("❌  REGRESSION FAILED:")
+    for f in FAILS:
+        print(f"    • {f}")
+    print("\n   Root cause (if kv_bits runs crash): unconditional `cache.update()` call")
+    print("   in Gemma4TextAttention.callAsFunction — see mlx-swift-lm PR #29.")
+    sys.exit(1)
+KVBITS_EOF
+    TEST9_EXIT=$?
+
+    echo ""
+    echo "Cleaning up..."
+    kill $SERVER_PID 2>/dev/null
+    wait $SERVER_PID 2>/dev/null
+
+    if [ $TEST9_EXIT -eq 0 ]; then
+        echo "✅ Test 9 PASSED"
+    else
+        echo "❌ Test 9 FAILED — see output above."
+    fi
+    exit $TEST9_EXIT
+fi
+
 # Fallback to Test 1 for anything else
 echo ""
 read -p "Enter context lengths to test [default: 512,40000,100000]: " CONTEXTS

From f007b3baa32fba5ca7e8777edb3ed1603eda9ea8 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 22 Apr 2026 11:33:10 -0700
Subject: [PATCH 2/4] docs+fix: kv_bits README docs + address Copilot review on
 PR #73
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

README.md:
- Added '🔧 Per-Request API Parameters' section with kv_bits table,
  kv_bits vs --turbo-kv comparison table, and curl usage example
- Clarified --turbo-kv CLI entry: 'activates after 2048 tokens, server-wide'

Server.swift:
- Added kv_bits input validation (only nil/4/8 accepted; returns 400 otherwise)
- Bypass prompt cache restore when kv_bits is set (prevents unsafe mixing of
  QuantizedKVCache and KVCacheSimple states across requests)
- Bypass prompt cache save when kv_bits is set (same safety reason)

run_benchmark.sh (Test 9):
- Corrected header comment to match actual assertions (removed false ≥20 token
  and multi-turn claims; stated actual ≥3 token / non-empty checks)
- Added explicit SERVER_READY flag + post-loop failure with log dump
- Widened thinking-block regex to handle both <|channel|>thought and <|channel>thought
---
 README.md                    | 38 +++++++++++++++++++++++++++++++++++-
 Sources/SwiftLM/Server.swift | 25 +++++++++++++++++++++---
 run_benchmark.sh             | 18 ++++++++++++-----
 3 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 9bf4bacd..e12b4999 100644
--- a/README.md
+++ b/README.md
@@ -352,10 +352,46 @@ curl http://localhost:5413/v1/chat/completions \
 | `--min-p` | `0.0` | Default min-p sampling threshold relative to the highest probability token (0 disables) |
 | `--gpu-layers` | `model_default`| Restrict the amount of layers allocated to GPU hardware |
 | `--stream-experts` | `false` | Enable SSD expert streaming for MoE models (10x speedup) |
-| `--turbo-kv` | `false` | Enable TurboQuant 3-bit KV cache compression |
+| `--turbo-kv` | `false` | Enable TurboQuant 3-bit KV cache compression (activates after 2048 tokens, server-wide) |
 | `--draft-model` | (none) | Draft model path/ID for speculative decoding (in-RAM models only) |
 | `--num-draft-tokens` | `4` | Number of draft tokens per speculation round |
 
+## 🔧 Per-Request API Parameters
+
+In addition to the standard OpenAI fields (`temperature`, `top_p`, `max_tokens`, etc.), SwiftLM accepts the following **SwiftLM-specific** fields on `POST /v1/chat/completions`:
+
+| Field | Type | Description |
+|---|---|---|
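+| `kv_bits` | `int` (4 or 8) | Enable **MLX-native quantized KV cache** for this request. Uses `QuantizedKVCache` (standard group quantization) instead of `KVCacheSimple`. Separate from `--turbo-kv`. Reduces KV memory ~2–4× at mild quality cost. Any other value is rejected with HTTP 400 (`invalid_kv_bits`), and requests that set `kv_bits` bypass the prompt cache (no restore, no save). |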
+| `enable_thinking` | `bool` | Force-enable or disable chain-of-thought thinking blocks for Gemma-4 / Qwen3. |
+| `kv_group_size` | `int` | Group size for `kv_bits` quantization (default: `64`). |
+| `top_k` | `int` | Per-request top-k sampling override (0 = disabled). |
+| `min_p` | `float` | Per-request min-p sampling threshold (0 = disabled). |
+| `repetition_penalty` | `float` | Token repetition penalty (e.g. `1.15`). |
+
+### `kv_bits` vs `--turbo-kv` — What's the difference?
+
+| | `kv_bits` (per-request) | `--turbo-kv` (server flag) |
+|---|---|---|
+| **Scope** | Per-request, sent in JSON body | Server-wide, set at startup |
+| **Algorithm** | MLX-native group quantization (4-bit / 8-bit) | Custom 3-bit PolarQuant + QJL Walsh-Hadamard |
+| **Activation** | From token 0 | After 2048 tokens |
+| **Memory savings** | ~2–4× vs FP16 | ~3.5× vs FP16 |
+| **Use case** | Targeted memory reduction per conversation | Extreme long-context (100K+) compression |
+
+### Example: Enable 4-bit KV cache per request
+```bash
+curl http://localhost:5413/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gemma-4-26b-a4b-it-4bit",
+    "kv_bits": 4,
+    "messages": [
+      {"role": "user", "content": "Summarize the history of computing in 3 sentences."}
+    ]
+  }'
+```
+
 ## 📦 Requirements
 
 - macOS 14.0+
diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift
index 094038b9..c6e416d3 100644
--- a/Sources/SwiftLM/Server.swift
+++ b/Sources/SwiftLM/Server.swift
@@ -1048,6 +1048,16 @@ func handleChatCompletion(
         // These are accepted but may not affect generation if MLX doesn't support them
     }
 
+    // ── Validate kv_bits: only nil, 4, and 8 are supported ──
+    if let kb = chatReq.kvBits, kb != 4 && kb != 8 {
+        let errBody = "{\"error\":{\"message\":\"Invalid kv_bits value \(kb). Supported values are 4 and 8.\",\"type\":\"invalid_request_error\",\"code\":\"invalid_kv_bits\"}}"
+        return Response(
+            status: .badRequest,
+            headers: jsonHeaders(),
+            body: .init(byteBuffer: ByteBuffer(string: errBody))
+        )
+    }
+
     let params = GenerateParameters(
         maxTokens: tokenLimit,
         maxKVSize: config.ctxSize,
@@ -1201,9 +1211,13 @@ func handleChatCompletion(
         // raw <|image|>/<|audio|> token embeddings instead of the projected features.
         let isMultimodalRequest = lmInput.image != nil || lmInput.audio != nil
 
-        // Try to restore via token-by-token prefix match (llama-server style)
+        // Try to restore via token-by-token prefix match (llama-server style).
+        // Skip for quantized-KV requests: the prompt cache stores KV state produced
+        // with KVCacheSimple; restoring it into a QuantizedKVCache (or vice-versa)
+        // is unsafe and produces incorrect results or runtime failures.
+        let skipPromptCache = isMultimodalRequest || params.kvBits != nil
         var stream: AsyncStream<Generation>
-        if !isMultimodalRequest, let cachedCount = await promptCache.restore(newTokens: promptTokens, into: cache) {
+        if !skipPromptCache, let cachedCount = await promptCache.restore(newTokens: promptTokens, into: cache) {
             // Cache hit: KV state is pre-populated up to cachedCount tokens.
             // Only compute the remaining (new) tokens.
             var startIndex = cachedCount
@@ -1252,6 +1266,10 @@ func handleChatCompletion(
         let onPrefillDone: (() async -> Void)? = {
             if turboHasCompressed {
                 print("[SwiftLM] 🧠 Skipping prompt cache save — TurboQuant has compressed \(cache.compactMap { ($0 as? KVCacheSimple)?.compressedOffset }.max() ?? 0) tokens. Saving would decode ~37 GB back to fp16.")
+            } else if params.kvBits != nil {
+                // kv_bits is set: the cache contains QuantizedKVCache layers whose token
+                // format is incompatible with the FP16 KVCacheSimple format expected by
+                // promptCache.save. Skip saving to prevent unsafe mixed-format restores.
             } else {
                 await promptCache.save(tokens: promptTokens, cache: cache)
             }
@@ -2306,7 +2324,8 @@ struct ChatCompletionRequest: Decodable {
     let chatTemplateKwargs: [String: Bool]?
     /// Top-level thinking override emitted by Aegis-AI gateway
     let enableThinking: Bool?
-    /// Number of bits for native MLX quantized KV cache (nil = no quantization, 4 or 8 typical).
+    /// Number of bits for native MLX quantized KV cache (nil = no quantization).
+    /// Only 4 and 8 are supported by the underlying MLX QuantizedKVCache.
     /// Enables `QuantizedKVCache` instead of `KVCacheSimple`.  Separate from `--turbo-kv`.
     let kvBits: Int?
 
diff --git a/run_benchmark.sh b/run_benchmark.sh
index 92b47b61..88b1dc86 100755
--- a/run_benchmark.sh
+++ b/run_benchmark.sh
@@ -978,9 +978,10 @@ fi
 # crash fixed in PR #29 of mlx-swift-lm.
 #
 # Pass criteria:
-#   - 4-bit run: prefill + ≥20 decode tokens, response is non-empty coherent text
+#   - 4-bit run: server does not crash, returns non-empty text response (≥3 tokens)
 #   - 8-bit run: same
-#   - Multi-turn run: second turn with kv_bits=4 also succeeds (exercises sharedKV path)
+#   - Longer prompt run: exercises the last-20-layer KV-sharing path, same pass criteria
+#   - Baseline (no kv_bits): regression guard that the non-quantized path still works
 if [ "$suite_opt" == "9" ]; then
     echo ""
     echo "=> Test 9: Quantized KV Cache Regression (issue #71) on $FULL_MODEL"
@@ -995,7 +996,7 @@ if [ "$suite_opt" == "9" ]; then
     $BIN --model "$FULL_MODEL" --port 5431 --stream-experts --ctx-size 8192 > ./tmp/kvcache_regression.log 2>&1 &
     SERVER_PID=$!
 
-    echo "Waiting for server (up to 180s)..."
+    SERVER_READY=0
     for i in {1..180}; do
         if ! kill -0 $SERVER_PID 2>/dev/null; then
             echo "❌ Server died early. Logs:"
@@ -1004,10 +1005,17 @@ if [ "$suite_opt" == "9" ]; then
         fi
         if curl -sf http://127.0.0.1:5431/health > /dev/null 2>&1; then
             echo "Server ready (${i}s)"
+            SERVER_READY=1
             break
         fi
         sleep 1
     done
+    if [ $SERVER_READY -eq 0 ]; then
+        echo "❌ Server not ready after 180s. Logs:"
+        print_server_log ./tmp/kvcache_regression.log
+        kill $SERVER_PID 2>/dev/null
+        exit 1
+    fi
 
     echo ""
     echo "Running QuantizedKVCache regression suite..."
@@ -1041,8 +1049,8 @@ def call(messages, kv_bits=None, max_tokens=60, temperature=0.0):
         return None, str(e), time.time() - t0
     elapsed = time.time() - t0
     content = d["choices"][0]["message"].get("content") or ""
-    # Strip Gemma-4 thinking blocks
-    content = re.sub(r"<\|channel\|>thought.*?<channel\|>", "", content, flags=re.DOTALL).strip()
+    # Strip Gemma-4 thinking blocks — handle both <|channel|>thought and <|channel>thought variants
+    content = re.sub(r"<\|channel\|?>thought.*?<channel\|?>", "", content, flags=re.DOTALL).strip()
     return d, content, elapsed
 
 MSGS_SHORT = [

From ccccdebfb2b323ff16359629a7a9fa707c4cf491 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 22 Apr 2026 11:41:08 -0700
Subject: [PATCH 3/4] docs: expand Supported Models section to full
 architecture list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace 🧠 with 📡 heading emoji
- Rewrite as structured tables (Text / Vision / Audio) with all 50+ model
  families derived from the actual MLXLLM + MLXVLM model file inventory
- LLM table: Gemma, Qwen, Phi, Mistral, Llama, GLM, DeepSeek, Falcon,
  LFM2, OLMo, Granite, SmolLM3, InternLM2, Cohere, Jamba, Exaone, MiMo,
  Ernie, Baichuan, Bailing, NemotronH, Starcoder2, OpenELM, BitNet,
  MiniMax, Apertus/AfMoE, MiniCPM, Qwen3Next
- VLM table: Gemma4, Gemma3, Qwen3-VL, Qwen2-VL/2.5-VL, LFM2-VL,
  Pixtral, PaliGemma, Idefics3, Mistral3, FastVLM, SmolVLM2, GlmOcr, QwenVL
- ALM table: Gemma-4-e4b only (factually correct — Qwen2-Audio removed;
  it was never wired into the audio pipeline here)
---
 README.md | 79 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index e12b4999..789a11fc 100644
--- a/README.md
+++ b/README.md
@@ -89,25 +89,76 @@ Benchmark results for `gemma-4-26b-a4b-it-4bit` (26B MoE, 4-bit) on M5 Pro 64 GB
 
 ---
 
-## 🧠 Supported Models & Methodologies
+## 📡 Supported Models & Methodologies
 
-`SwiftLM` dynamically maps Apple MLX primitives to standard HuggingFace architectures, enabling complete support for the latest frontier open-weights models across modalities (Text, Vision, Audio).
+`SwiftLM` dynamically maps Apple MLX primitives to standard HuggingFace architectures, enabling native Metal inference across the latest frontier open-weights models.
 
-### Text (LLMs)
-- **Gemma 4**: Fully supports both Dense (`gemma-4-e4b`) and Sparse Mixture of Experts (MoE) architectures (`gemma-4-26b`, `gemma-4-31b`).
-- **Qwen 2.5 & 3**: Robust support for sliding window attention limits and custom RoPE scaling.
-- **Mistral & Mixtral**: Out-of-the-box structural mappings.
-- **Phi-3 & Phi-3.5**: Full 128k context parsing via Swift chunked-prefill.
+### 💬 Text (LLMs)
 
-### Vision (VLMs)
+| Family | Models | Notes |
+|---|---|---|
+| **Gemma 4** | `gemma-4-e2b`, `gemma-4-e4b` (dense) · `gemma-4-26b-a4b`, `gemma-4-31b` (MoE) | Interleaved local + global attention; KV sharing; native quantized KV cache (issue #71 fix) |
+| **Gemma 3 / 3n** | `gemma-3-*`, `gemma-3n-*` | Google Gemma 3 and nano variants |
+| **Gemma / Gemma 2** | `gemma-*`, `gemma-2-*` | Original Gemma family |
+| **Qwen 3.5** | `Qwen3.5-7B`, `Qwen3.5-27B`, `Qwen3.5-122B-A10B`, `Qwen3.5-397B-A22B` | Dense + MoE; SSD streaming at 10× for 122B/397B |
+| **Qwen 3** | `Qwen3-*` (dense + MoE) | Sliding window + hybrid attention |
+| **Qwen 2.5** | `Qwen2.5-7B`, `Qwen2.5-14B`, `Qwen2.5-72B` | Robust RoPE scaling |
+| **Qwen 2** | `Qwen2-*` | Linear RoPE variants |
+| **Phi 4 / PhiMoE** | `phi-4-mlx`, `Phi-3.5-MoE` | Microsoft Phi family incl. MoE |
+| **Phi 3 / Phi** | `Phi-3`, `Phi-3.5-mini` | 128k context via chunked prefill |
+| **Mistral / Mixtral** | `Mistral-7B`, `Mistral-4`, `Mixtral-*` | GQA + sliding window variants |
+| **Llama / Llama 3** | `Llama-3.1-*`, `Llama-3.2-*`, `Llama-3.3-*` | YaRN + dynamic NTK RoPE scaling |
+| **GLM 4 / GLM 5.1** | `GLM-4-*`, `GLM-5.1-RAM-270GB`, `GLM-5.1-4bit` | Dense + MoE-Lite variants |
+| **DeepSeek V3** | `DeepSeek-V3-*` | MLA attention architecture |
+| **Falcon H1** | `Falcon-H1-*` | Falcon hybrid SSM+attention |
+| **LFM 2** | `LFM2-*`, `LFM2-MoE-*` | Liquid AI dense + MoE |
+| **OLMo 2 / OLMo 3 / OLMoE** | `OLMo-2-*`, `OLMo-3-*` | AllenAI open language models |
+| **Granite / GraniteMoE** | `Granite-*`, `GraniteMoE-Hybrid-*` | IBM Granite hybrid Mamba+attention |
+| **SmolLM 3** | `SmolLM3-*` | HuggingFace compact LM |
+| **MiniCPM** | `MiniCPM-*` | Lightweight efficient LM |
+| **InternLM 2** | `InternLM2-*` | Shanghai AI Lab series |
+| **Cohere / Command-R** | `Command-R-*`, `c4ai-*` | Cohere retrieval-tuned models |
+| **Jamba** | `Jamba-v0.1` | AI21 hybrid Mamba+attention |
+| **Exaone 4** | `EXAONE-4.0-*` | LG AI Research |
+| **MiMo / MiMo V2** | `MiMo-7B-*` | Xiaomi reasoning model |
+| **Ernie 4.5** | `ERNIE-4.5-*` | Baidu ERNIE series |
+| **Baichuan M1** | `Baichuan-M1-*` | Baichuan multimodal base |
+| **Bailing MoE** | `Ling-*` | Bailing/Ling MoE family |
+| **NemotronH** | `Nemotron-H-*` | NVIDIA Nemotron hybrid |
+| **Starcoder 2** | `starcoder2-*` | Code generation |
+| **OpenELM** | `OpenELM-*` | Apple on-device efficient LM |
+| **Apertus / AfMoE** | `Apertus-*` | Sparse MoE research models |
+| **BitNet** | `bitnet-*` | 1-bit weight quantization |
+| **MiniMax** | `MiniMax-Text-*` | Lightning attention architecture |
+| **Olmo3** | `Olmo3-*` | AllenAI Olmo3 series |
+
+### 👁️ Vision (VLMs)
 *Run with `--vision` flag.*
-- **Qwen2-VL & Qwen3-VL**: Real-time positional bounding and Metal image scaling.
-- **PaliGemma / LFM2-VL / Pixtral**: Base64 spatial decomposition.
 
-### Audio (ALMs)
-*Run with `--audio` flag.*
-- **Qwen2-Audio (7B-Instruct)**: Deep multi-modal spectrogram processing via Swift audio interleaving.
-- **Gemma-4 Audio Pipelines**: Ready for Audio-in/Text-out variants mapping `.audio_tower` extraction parameters natively off NVMe.
+| Family | Models | Notes |
+|---|---|---|
+| **Gemma 4** | `gemma-4-*` (VLM mode) | Native image tower via MLXVLM |
+| **Gemma 3** | `gemma-3-*` (VLM mode) | PaliGemma-style image projection |
+| **Qwen3-VL / Qwen3.5-VL** | `Qwen3-VL-*`, `Qwen3.5-VL-*` | Dynamic resolution with native RoPE |
+| **Qwen2-VL / Qwen2.5-VL** | `Qwen2-VL-2B/7B`, `Qwen2.5-VL-*` | Real-time positional bounding + Metal image scaling |
+| **LFM2-VL** | `LFM2-VL-1.6B` | Liquid AI multimodal |
+| **Pixtral** | `pixtral-12b` | Mistral vision model |
+| **PaliGemma** | `paligemma-*` | Google vision-language |
+| **Idefics 3** | `Idefics3-*` | HuggingFace multimodal |
+| **Mistral 3** | `Mistral-Small-3.1-*` | Mistral vision variant |
+| **FastVLM** | `FastVLM-*` | Apple on-device VLM |
+| **SmolVLM 2** | `SmolVLM2-*` | HuggingFace compact VLM |
+| **GLM OCR** | `glm-4v-*` | THUDM vision+OCR |
+| **QwenVL** | `Qwen-VL-*` | Original Qwen VL |
+
+### 🎧 Audio (ALMs)
+*Run with `--audio` flag. Only `gemma-4-e4b` variants include an audio tower.*
+
+| Family | Models | Notes |
+|---|---|---|
+| **Gemma 4 Omni** | `gemma-4-e4b-it-4bit`, `gemma-4-e4b-it-8bit` | Audio-in via vDSP STFT → Mel spectrogram (16kHz, 128 bins); text-out |
+
+
 
 ---
 

From ed5f8f6db8f851680e7358fece19fff9d183c1b0 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 22 Apr 2026 11:43:42 -0700
Subject: [PATCH 4/4] docs: remove GLM 5.1 from supported models (still on
 feature branch, reverted from main in 50c3732)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 789a11fc..0e8fb1f8 100644
--- a/README.md
+++ b/README.md
@@ -108,7 +108,7 @@ Benchmark results for `gemma-4-26b-a4b-it-4bit` (26B MoE, 4-bit) on M5 Pro 64 GB
 | **Phi 3 / Phi** | `Phi-3`, `Phi-3.5-mini` | 128k context via chunked prefill |
 | **Mistral / Mixtral** | `Mistral-7B`, `Mistral-4`, `Mixtral-*` | GQA + sliding window variants |
 | **Llama / Llama 3** | `Llama-3.1-*`, `Llama-3.2-*`, `Llama-3.3-*` | YaRN + dynamic NTK RoPE scaling |
-| **GLM 4 / GLM 5.1** | `GLM-4-*`, `GLM-5.1-RAM-270GB`, `GLM-5.1-4bit` | Dense + MoE-Lite variants |
+| **GLM 4** | `GLM-4-*` | THUDM GLM-4 dense + MoE-Lite variants |
 | **DeepSeek V3** | `DeepSeek-V3-*` | MLA attention architecture |
 | **Falcon H1** | `Falcon-H1-*` | Falcon hybrid SSM+attention |
 | **LFM 2** | `LFM2-*`, `LFM2-MoE-*` | Liquid AI dense + MoE |