From d06b95854686cda2591ccd1d1572e6e832676470 Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Tue, 26 May 2026 10:42:03 -0700
Subject: [PATCH 1/2] Update glm-5 b200 sglang container image to
 nightly-dev-cu13-20260523-c112f762

---
 .github/configs/nvidia-master.yaml |  8 ++++----
 perf-changelog.yaml                | 24 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3d1a70d42..553e69e86 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2208,7 +2208,7 @@ qwen3.5-fp4-b200-sglang-mtp:
       - { tp: 2, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-b200-sglang:
-  image: lmsysorg/sglang:v0.5.12-cu130
+  image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: b200
@@ -2227,7 +2227,7 @@ glm5-fp8-b200-sglang:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
 
 glm5-fp8-b200-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-cu130
+  image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: b200
@@ -2307,7 +2307,7 @@ glm5-fp8-b300-sglang-mtp:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
 
 glm5-fp4-b200-sglang:
-  image: lmsysorg/sglang:v0.5.12-cu130
+  image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
   runner: b200
@@ -2328,7 +2328,7 @@ glm5-fp4-b200-sglang:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
 glm5-fp4-b200-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-cu130
+  image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
   runner: b200
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 208a2da6f..69756453e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3129,3 +3129,27 @@
   description:
     - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat template (matching the other Qwen MTP recipes)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555
+
+- config-keys:
+    - glm5-fp4-b200-sglang
+  description:
+    - "Update SGLang image from v0.5.11-cu130 to nightly-dev-cu13-20260523-c112f762"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561
+
+- config-keys:
+    - glm5-fp4-b200-sglang-mtp
+  description:
+    - "Update SGLang image from v0.5.11-cu130 to nightly-dev-cu13-20260523-c112f762"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561
+
+- config-keys:
+    - glm5-fp8-b200-sglang
+  description:
+    - "Update SGLang image from v0.5.11-cu130 to nightly-dev-cu13-20260523-c112f762"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561
+
+- config-keys:
+    - glm5-fp8-b200-sglang-mtp
+  description:
+    - "Update SGLang image from v0.5.11-cu130 to nightly-dev-cu13-20260523-c112f762"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561

From 12cae11a40131af097975b2105e41297b5bb25fc Mon Sep 17 00:00:00 2001
From: Ankur Singh <as.ankursingh3.1@gmail.com>
Date: Thu, 28 May 2026 10:16:35 -0700
Subject: [PATCH 2/2] Remove --fp8-gemm-backend cutlass from GLM-5 FP8 B200 scripts

---
 benchmarks/single_node/glm5_fp8_b200.sh     |  1 -
 benchmarks/single_node/glm5_fp8_b200_mtp.sh |  1 -
 perf-changelog.yaml                         | 66 ++++++++++++++++++---
 3 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/glm5_fp8_b200.sh
index ccaa87b98..2cd84dddc 100755
--- a/benchmarks/single_node/glm5_fp8_b200.sh
+++ b/benchmarks/single_node/glm5_fp8_b200.sh
@@ -45,7 +45,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --tool-call-parser glm47 \
 --reasoning-parser glm45 \
 --kv-cache-dtype fp8_e4m3 --quantization fp8 \
---fp8-gemm-backend cutlass \
 --attention-backend nsa \
 --nsa-decode-backend trtllm --nsa-prefill-backend trtllm \
 --moe-runner-backend flashinfer_trtllm \
diff --git a/benchmarks/single_node/glm5_fp8_b200_mtp.sh b/benchmarks/single_node/glm5_fp8_b200_mtp.sh
index 5e4f98533..ecd5ca0af 100755
--- a/benchmarks/single_node/glm5_fp8_b200_mtp.sh
+++ b/benchmarks/single_node/glm5_fp8_b200_mtp.sh
@@ -46,7 +46,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --tool-call-parser glm47 \
 --reasoning-parser glm45 \
 --kv-cache-dtype fp8_e4m3 --quantization fp8 \
---fp8-gemm-backend cutlass \
 --attention-backend nsa \
 --nsa-decode-backend trtllm --nsa-prefill-backend trtllm \
 --moe-runner-backend flashinfer_trtllm \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 69756453e..b7142c023 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2449,6 +2449,15 @@
     - "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1346
 
+- config-keys:
+    - dsr1-fp4-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg-mtp
+  description:
+    - "Fix the eval result of dsr1 fp4 with fp8 blockwise combine"
+    - "Bump the image to May 19"
+    - "Add conc 512 new sweep point"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1566
+
 - config-keys:
     - kimik2.5-int4-h200-vllm
   description:
@@ -2965,6 +2974,18 @@
     - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440
 
+- config-keys:
+    - kimik2.5-fp4-mi355x-vllm-disagg
+  description:
+    - "Add Kimi-K2.5-MXFP4 FP4 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569
+
+- config-keys:
+    - minimaxm2.5-fp8-mi355x-vllm-disagg
+  description:
+    - "Add MiniMax-M2.5 FP8 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569
+
 - config-keys:
     - dsv4-fp4-mi355x-vllm
   description:
@@ -3131,24 +3152,51 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555
 
 - config-keys:
-    - glm5-fp4-b200-sglang
+    - minimaxm2.5-fp8-h200-vllm
   description:
-    - "Update SGLang image from v0.5.11-cu130 to nightly-dev-cu13-20260523-c112f762"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561
+    - "Update MiniMax-M2.5 FP8 H200 vLLM to vllm/vllm-openai:v0.20.1-ubuntu2404"
+    - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354
 
 - config-keys:
-    - glm5-fp4-b200-sglang-mtp
+    - dsv4-fp4-mi355x-sglang
   description:
-    - "Update SGLang image from v0.5.11-cu130 to nightly-dev-cu13-20260523-c112f762"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561
+    - "Bump image to rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4"
+    - "Add args to avoid kvcache pool full issue on high conc"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1568
 
 - config-keys:
-    - glm5-fp8-b200-sglang
+    - qwen3.5-fp8-h200-sglang
+    - dsr1-fp8-mi355x-sglang
   description:
-    - "Update SGLang image from v0.5.11-cu130 to nightly-dev-cu13-20260523-c112f762"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561
+    - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558
+
+- config-keys:
+    - qwen3.5-fp8-mi355x-sglang-disagg
+  description:
+    - "Add Qwen3.5-397B-A17B-FP8 MI355X SGLang disaggregated prefill-decode benchmark"
+    - "Image: lmsysorg/sglang-rocm:v0.5.11-rocm700-mi35x-20260511"
+    - "1P+1D TP8/EP1 smoke sweep for 1k1k and 8k1k (conc 8-512); MoRI transfer backend"
+    - "Add models.yaml server flags and multinode launch script qwen3.5_fp8_mi355x_sglang-disagg.sh"
+    - "8k1k row uses dp-attn=false (matches 1k1k): with --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI, so num_shared_slots stays at the global value (1) and the (num_experts - num_shared_slots) % moe_ep_size assertion in fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared). Track upstream sglang; flip back to dp-attn=true once MoRI is added to is_deepep_class_backend() or shared-slot accounting is reconciled."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1570
+
+- config-keys:
+    - glm5-fp8-mi355x-sglang-disagg
+  description:
+    - "Add GLM-5-FP8 MI355X SGLang disaggregated prefill-decode benchmark"
+    - "Image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523 (bumped from .v0.5.12-...-20260517 to unlock the PD-disagg MoRI overlay; matches chun-chang/sglang-disagg-qwen3.5)"
+    - "Adds patches/mori_conn.py overlay (bind-mounted via job.slurm) to fix sglang v0.5.12.post1 MoRI/PD startup crashes for hybrid-attention models (GLM-5 NSA, etc.): sender flatten, state_types plural fallback, consumer normalize, SWA/DSA rank/length normalize. Validated: GSM8K=0.971 strict/0.970 flexible on chun-chang. Auto-applied for v0.5.12.post1 images; opt-out via MORI_CONN_PATCH=skip."
+    - "1P+1D TP8/EP1 CI smoke sweep for 1k1k and 8k1k (conc 8-512)"
+    - "Add GLM-5-FP8 models.yaml flags, setup_deps.sh (aiter gluon + transformers glm_moe_dsa), GLM-5 env tuning in env.sh"
+    - "Add multinode launch script glm5_fp8_mi355x_sglang-disagg.sh; server.sh sources setup_deps.sh"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1572
 
 - config-keys:
+    - glm5-fp4-b200-sglang
+    - glm5-fp4-b200-sglang-mtp
+    - glm5-fp8-b200-sglang
     - glm5-fp8-b200-sglang-mtp
   description:
-    - "Update SGLang image from v0.5.11-cu130 to nightly-dev-cu13-20260523-c112f762"
+    - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762"